Old engine for Continuous Time Bayesian Networks. Superseded by reCTBN. 🐍
https://github.com/madlabunimib/PyCTBN
You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
592 lines
17 KiB
592 lines
17 KiB
from typing import Optional
|
|
import warnings
|
|
|
|
import numpy as np
|
|
|
|
from pandas._libs.algos import unique_deltas
|
|
from pandas._libs.tslibs import Timestamp, tzconversion
|
|
from pandas._libs.tslibs.ccalendar import (
|
|
DAYS,
|
|
MONTH_ALIASES,
|
|
MONTH_NUMBERS,
|
|
MONTHS,
|
|
int_to_weekday,
|
|
)
|
|
from pandas._libs.tslibs.fields import build_field_sarray, month_position_check
|
|
from pandas._libs.tslibs.offsets import ( # noqa:F401
|
|
DateOffset,
|
|
Day,
|
|
_get_offset,
|
|
to_offset,
|
|
)
|
|
from pandas._libs.tslibs.parsing import get_rule_month
|
|
from pandas.util._decorators import cache_readonly
|
|
|
|
from pandas.core.dtypes.common import (
|
|
is_datetime64_dtype,
|
|
is_period_dtype,
|
|
is_timedelta64_dtype,
|
|
)
|
|
from pandas.core.dtypes.generic import ABCSeries
|
|
|
|
from pandas.core.algorithms import unique
|
|
|
|
# Fixed-width durations as integer multiples of the base tick
# (1000 ticks per microsecond, i.e. the tick is one nanosecond).
_ONE_MICRO = 1000
_ONE_MILLI = 1000 * _ONE_MICRO
_ONE_SECOND = 1000 * _ONE_MILLI
_ONE_MINUTE = _ONE_SECOND * 60
_ONE_HOUR = _ONE_MINUTE * 60
_ONE_DAY = _ONE_HOUR * 24
|
|
|
|
# ---------------------------------------------------------------------
|
|
# Offset names ("time rules") and related functions
|
|
|
|
# Lookup table: offset alias -> closest period alias (e.g. "BQ" -> "Q").
# Seeded with the un-anchored aliases, then extended below with the
# month- and weekday-anchored variants.
_offset_to_period_map = {
    "WEEKDAY": "D",
    "EOM": "M",
    "BM": "M",
    "BQS": "Q",
    "QS": "Q",
    "BQ": "Q",
    "BA": "A",
    "AS": "A",
    "BAS": "A",
    "MS": "M",
    "D": "D",
    "C": "C",
    "B": "B",
    "T": "T",
    "S": "S",
    "L": "L",
    "U": "U",
    "N": "N",
    "H": "H",
    "Q": "Q",
    "A": "A",
    "W": "W",
    "M": "M",
    "Y": "A",
    "BY": "A",
    "YS": "A",
    "BYS": "A",
}

# Offset aliases that are also registered in "<alias>-<MONTH>" form.
_need_suffix = ["QS", "BQ", "BQS", "YS", "AS", "BY", "BA", "BYS", "BAS"]

# A month-anchored offset maps to the same period alias as its bare prefix.
for _prefix in _need_suffix:
    for _m in MONTHS:
        key = f"{_prefix}-{_m}"
        _offset_to_period_map[key] = _offset_to_period_map[_prefix]

# Month-anchored annual/quarterly aliases are already period aliases,
# so each one maps to itself.
for _prefix in ["A", "Q"]:
    for _m in MONTHS:
        _alias = f"{_prefix}-{_m}"
        _offset_to_period_map[_alias] = _alias

# Likewise, every weekday-anchored weekly alias maps to itself.
for _d in DAYS:
    _offset_to_period_map[f"W-{_d}"] = f"W-{_d}"
|
|
|
|
|
|
def get_period_alias(offset_str: str) -> Optional[str]:
    """
    Look up the period alias closest to an offset alias (e.g. BQ -> Q).

    Parameters
    ----------
    offset_str : str
        Offset alias to translate.

    Returns
    -------
    str or None
        The matching period alias, or None when ``offset_str`` is not a
        key of ``_offset_to_period_map``.
    """
    return _offset_to_period_map.get(offset_str)
|
|
|
|
|
|
def get_offset(name: str) -> DateOffset:
    """
    Return the DateOffset object registered under a rule name.

    .. deprecated:: 1.0.0
        Emits a FutureWarning on every call; use ``to_offset`` instead.

    Parameters
    ----------
    name : str
        Rule name, forwarded unchanged to ``_get_offset``.

    Returns
    -------
    DateOffset

    Examples
    --------
    get_offset('EOM') --> BMonthEnd(1)
    """
    message = (
        "get_offset is deprecated and will be removed in a future version, "
        "use to_offset instead"
    )
    # stacklevel=2 attributes the warning to the caller of get_offset
    warnings.warn(message, FutureWarning, stacklevel=2)
    return _get_offset(name)
|
|
|
|
|
|
# ---------------------------------------------------------------------
|
|
# Period codes
|
|
|
|
|
|
def infer_freq(index, warn: bool = True) -> Optional[str]:
    """
    Infer the most likely frequency of a datetime-like input.

    Parameters
    ----------
    index : DatetimeIndex or TimedeltaIndex
        If a Series is passed, its values are used (NOT its index).
        Anything that is not already a DatetimeIndex (and is not
        timedelta-typed) is converted with ``pd.DatetimeIndex``.
    warn : bool, default True
        Forwarded to the frequency inferer.

    Returns
    -------
    str or None
        None if no discernible frequency.

    Raises
    ------
    TypeError
        If the input is a Series of a non-convertible dtype, a
        period-dtype object, or an Int64Index / Float64Index.
    ValueError
        If there are fewer than three values.
    """
    # Imported here rather than at module level.
    import pandas as pd

    if isinstance(index, ABCSeries):
        values = index._values
        convertible = (
            is_datetime64_dtype(values)
            or is_timedelta64_dtype(values)
            or values.dtype == object
        )
        if not convertible:
            raise TypeError(
                "cannot infer freq from a non-convertible dtype "
                f"on a Series of {index.dtype}"
            )
        index = values

    inferer: _FrequencyInferer

    # Objects without a dtype skip these checks and are handed to
    # DatetimeIndex further down.
    if hasattr(index, "dtype"):
        if is_period_dtype(index.dtype):
            raise TypeError(
                "PeriodIndex given. Check the `freq` attribute "
                "instead of using infer_freq."
            )
        if is_timedelta64_dtype(index.dtype):
            # Allow TimedeltaIndex and TimedeltaArray
            inferer = _TimedeltaFrequencyInferer(index, warn=warn)
            return inferer.get_freq()

    if not isinstance(index, pd.DatetimeIndex):
        if isinstance(index, pd.Index):
            if isinstance(index, (pd.Int64Index, pd.Float64Index)):
                raise TypeError(
                    f"cannot infer freq from a non-convertible index type {type(index)}"
                )
            # Unwrap other Index types before the datetime conversion
            index = index._values
        index = pd.DatetimeIndex(index)

    inferer = _FrequencyInferer(index, warn=warn)
    return inferer.get_freq()
|
|
|
|
|
|
class _FrequencyInferer:
    """
    Infer a frequency alias from the spacing of a datetime-like index.

    Derived quantities (unique deltas, calendar fields, month and year
    differences) are computed lazily through cached properties, and
    ``get_freq`` tries day-based rules first and intraday rules second.

    Parameters
    ----------
    index : datetime-like index or array
        Must expose ``asi8``, ``_is_monotonic_increasing``,
        ``_is_monotonic_decreasing`` and ``_is_unique``.
    warn : bool, default True
        Stored on the instance; not otherwise used inside this class.

    Raises
    ------
    ValueError
        If ``index`` holds fewer than three values.
    """

    def __init__(self, index, warn: bool = True):
        self.index = index
        self.i8values = index.asi8

        # ``asi8`` is implicitly in UTC; for tz-aware input, move the values
        # to the index's timezone so they represent local wall time.
        if getattr(index, "tz", None) is not None:
            self.i8values = tzconversion.tz_convert_from_utc(
                self.i8values, index.tz
            )

        self.warn = warn

        if len(index) < 3:
            raise ValueError("Need at least 3 dates to infer frequency")

        self.is_monotonic = (
            self.index._is_monotonic_increasing or self.index._is_monotonic_decreasing
        )

    @cache_readonly
    def deltas(self):
        """Unique deltas between consecutive (wall-time) ``i8values``."""
        return unique_deltas(self.i8values)

    @cache_readonly
    def deltas_asi8(self):
        """Unique deltas between consecutive raw ``index.asi8`` values."""
        # NB: we cannot use self.i8values here because we may have converted
        # the tz in __init__
        return unique_deltas(self.index.asi8)

    @cache_readonly
    def is_unique(self) -> bool:
        """True when all wall-time deltas are identical."""
        return len(self.deltas) == 1

    @cache_readonly
    def is_unique_asi8(self) -> bool:
        """True when all raw ``asi8`` deltas are identical."""
        return len(self.deltas_asi8) == 1

    def get_freq(self) -> Optional[str]:
        """
        Find the appropriate frequency string to describe the inferred
        frequency of self.i8values

        Returns
        -------
        str or None
            None when the index is not monotonic, contains duplicates, or
            no rule matches.
        """
        if not self.is_monotonic or not self.index._is_unique:
            return None

        delta = self.deltas[0]
        if _is_multiple(delta, _ONE_DAY):
            return self._infer_daily_rule()

        # Business hourly, maybe. 17: one day / 65: one weekend
        if self.hour_deltas in ([1, 17], [1, 65], [1, 17, 65]):
            return "BH"
        # Possibly intraday frequency. Here we use the
        # original .asi8 values as the modified values
        # will not work around DST transitions. See #8772
        elif not self.is_unique_asi8:
            return None

        delta = self.deltas_asi8[0]
        if _is_multiple(delta, _ONE_HOUR):
            # Hours
            return _maybe_add_count("H", delta / _ONE_HOUR)
        elif _is_multiple(delta, _ONE_MINUTE):
            # Minutes
            return _maybe_add_count("T", delta / _ONE_MINUTE)
        elif _is_multiple(delta, _ONE_SECOND):
            # Seconds
            return _maybe_add_count("S", delta / _ONE_SECOND)
        elif _is_multiple(delta, _ONE_MILLI):
            # Milliseconds
            return _maybe_add_count("L", delta / _ONE_MILLI)
        elif _is_multiple(delta, _ONE_MICRO):
            # Microseconds
            return _maybe_add_count("U", delta / _ONE_MICRO)
        else:
            # Nanoseconds
            return _maybe_add_count("N", delta)

    @cache_readonly
    def day_deltas(self):
        """Wall-time deltas expressed in days."""
        return [x / _ONE_DAY for x in self.deltas]

    @cache_readonly
    def hour_deltas(self):
        """Wall-time deltas expressed in hours."""
        return [x / _ONE_HOUR for x in self.deltas]

    @cache_readonly
    def fields(self):
        """Structured array of calendar fields built from ``i8values``."""
        return build_field_sarray(self.i8values)

    @cache_readonly
    def rep_stamp(self):
        """Representative Timestamp: the first wall-time value."""
        return Timestamp(self.i8values[0])

    def month_position_check(self):
        """
        Classify where in the month the values fall.

        Delegates to the module-level ``month_position_check``; the rule
        lookups below expect one of "cs", "bs", "ce", "be" and treat any
        other result as "no rule".
        """
        return month_position_check(self.fields, self.index.dayofweek)

    @cache_readonly
    def mdiffs(self):
        """Unique differences between consecutive values, in months."""
        nmonths = self.fields["Y"] * 12 + self.fields["M"]
        return unique_deltas(nmonths.astype("i8"))

    @cache_readonly
    def ydiffs(self):
        """Unique differences between consecutive values, in years."""
        return unique_deltas(self.fields["Y"].astype("i8"))

    def _infer_daily_rule(self) -> Optional[str]:
        """
        Infer a rule for data whose first delta is a whole number of days.

        Candidates are tried in order: annual, quarterly, monthly,
        fixed weekly/daily spacing, business daily, week-of-month.
        """
        annual_rule = self._get_annual_rule()
        if annual_rule:
            nyears = self.ydiffs[0]
            month = MONTH_ALIASES[self.rep_stamp.month]
            alias = f"{annual_rule}-{month}"
            return _maybe_add_count(alias, nyears)

        quarterly_rule = self._get_quarterly_rule()
        if quarterly_rule:
            nquarters = self.mdiffs[0] / 3
            # Map the month's position within its quarter (month % 3) to
            # the month number used as the anchor of the alias.
            mod_dict = {0: 12, 2: 11, 1: 10}
            month = MONTH_ALIASES[mod_dict[self.rep_stamp.month % 3]]
            alias = f"{quarterly_rule}-{month}"
            return _maybe_add_count(alias, nquarters)

        monthly_rule = self._get_monthly_rule()
        if monthly_rule:
            return _maybe_add_count(monthly_rule, self.mdiffs[0])

        if self.is_unique:
            days = self.deltas[0] / _ONE_DAY
            if days % 7 == 0:
                # Weekly
                day = int_to_weekday[self.rep_stamp.weekday()]
                return _maybe_add_count(f"W-{day}", days / 7)
            else:
                return _maybe_add_count("D", days)

        if self._is_business_daily():
            return "B"

        wom_rule = self._get_wom_rule()
        if wom_rule:
            return wom_rule

        return None

    def _get_annual_rule(self) -> Optional[str]:
        """Return the annual alias prefix, or None if the data is not annual."""
        if len(self.ydiffs) > 1:
            return None

        # All values must fall in the same calendar month
        if len(unique(self.fields["M"])) > 1:
            return None

        pos_check = self.month_position_check()
        return {"cs": "AS", "bs": "BAS", "ce": "A", "be": "BA"}.get(pos_check)

    def _get_quarterly_rule(self) -> Optional[str]:
        """Return the quarterly alias prefix, or None if not quarterly."""
        if len(self.mdiffs) > 1:
            return None

        # The constant month step must be a whole number of quarters
        if self.mdiffs[0] % 3 != 0:
            return None

        pos_check = self.month_position_check()
        return {"cs": "QS", "bs": "BQS", "ce": "Q", "be": "BQ"}.get(pos_check)

    def _get_monthly_rule(self) -> Optional[str]:
        """Return the monthly alias, or None if the month step varies."""
        if len(self.mdiffs) > 1:
            return None
        pos_check = self.month_position_check()
        return {"cs": "MS", "bs": "BMS", "ce": "M", "be": "BM"}.get(pos_check)

    def _is_business_daily(self) -> bool:
        """
        Check whether the values step through consecutive weekdays.

        Every step must be one day landing on Tuesday-Friday, or three
        days landing on Monday.
        """
        # quick check: cannot be business daily
        if self.day_deltas != [1, 3]:
            return False

        # probably business daily, but need to confirm
        first_weekday = self.index[0].weekday()
        # Use the wall-time values (as day_deltas and first_weekday do), not
        # the raw UTC asi8: across a DST change a local one-day step is not
        # a whole day in UTC and would floor to the wrong day count.
        shifts = np.floor_divide(np.diff(self.i8values), _ONE_DAY)
        weekdays = np.mod(first_weekday + np.cumsum(shifts), 7)
        landed_on_monday = (weekdays == 0) & (shifts == 3)
        landed_midweek = (weekdays > 0) & (weekdays <= 4) & (shifts == 1)
        # np.all returns a numpy bool; convert to honour the annotation
        return bool(np.all(landed_on_monday | landed_midweek))

    def _get_wom_rule(self) -> Optional[str]:
        """
        Return a week-of-month alias ("WOM-<week><weekday>"), or None.

        Requires a single weekday and, among weeks 1-4 of the month, a
        single week position.
        """
        weekdays = unique(self.index.weekday)
        if len(weekdays) > 1:
            return None

        week_of_months = unique((self.index.day - 1) // 7)
        # Only attempt to infer up to WOM-4. See #9425
        week_of_months = week_of_months[week_of_months < 4]
        if len(week_of_months) != 1:
            return None

        # get which week
        week = week_of_months[0] + 1
        wd = int_to_weekday[weekdays[0]]

        return f"WOM-{week}{wd}"
|
|
|
|
|
|
class _TimedeltaFrequencyInferer(_FrequencyInferer):
    """Inferer variant whose day-based rule only considers fixed spacing."""

    def _infer_daily_rule(self):
        """
        Return a weekly or daily alias for evenly spaced values.

        Returns None when the spacing is not constant.
        """
        if not self.is_unique:
            return None

        n_days = self.deltas[0] / _ONE_DAY
        if n_days % 7 != 0:
            return _maybe_add_count("D", n_days)

        # Weekly
        weekday_name = int_to_weekday[self.rep_stamp.weekday()]
        return _maybe_add_count(f"W-{weekday_name}", n_days / 7)
|
|
|
|
|
|
def _is_multiple(us, mult: int) -> bool:
    """Return whether ``us`` is an exact multiple of ``mult``."""
    remainder = us % mult
    return remainder == 0
|
|
|
|
|
|
def _maybe_add_count(base: str, count: float) -> str:
    """
    Prefix a frequency alias with its multiple, unless the multiple is 1.

    Parameters
    ----------
    base : str
        Frequency alias.
    count : float
        Multiple of ``base``; asserted to be integral when not 1.

    Returns
    -------
    str
        ``base`` when ``count`` equals 1, otherwise ``f"{int(count)}{base}"``.
    """
    if count == 1:
        return base

    whole = int(count)
    assert count == whole
    return f"{whole}{base}"
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# Frequency comparison
|
|
|
|
|
|
def is_subperiod(source, target) -> bool:
    """
    Report whether data at ``source`` frequency can be downsampled to
    ``target`` frequency.

    Parameters
    ----------
    source : str or DateOffset
        Frequency converting from
    target : str or DateOffset
        Frequency converting to

    Returns
    -------
    bool
        Always False when either argument is None.
    """
    if source is None or target is None:
        return False

    source = _maybe_coerce_freq(source)
    target = _maybe_coerce_freq(target)

    # Intraday codes ordered from coarsest to finest.
    intraday = ["H", "T", "S", "L", "U", "N"]
    daily_and_finer = {"D", "C", "B", *intraday}

    if _is_annual(target):
        if _is_quarterly(source):
            source_month = get_rule_month(source)
            target_month = get_rule_month(target)
            return _quarter_months_conform(source_month, target_month)
        return source == "M" or source in daily_and_finer
    if _is_quarterly(target):
        return source == "M" or source in daily_and_finer
    if _is_monthly(target):
        return source in daily_and_finer
    if _is_weekly(target):
        return source == target or source in daily_and_finer

    # Each day-level code accepts only itself plus any intraday code.
    if target in ("B", "C", "D"):
        return source == target or source in intraday

    # An intraday target accepts itself and anything finer.
    if target in intraday:
        return source in intraday[intraday.index(target):]

    return False
|
|
|
|
|
|
def is_superperiod(source, target) -> bool:
    """
    Report whether data at ``source`` frequency can be upsampled to
    ``target`` frequency.

    Parameters
    ----------
    source : str or DateOffset
        Frequency converting from
    target : str or DateOffset
        Frequency converting to

    Returns
    -------
    bool
        Always False when either argument is None.
    """
    if source is None or target is None:
        return False

    source = _maybe_coerce_freq(source)
    target = _maybe_coerce_freq(target)

    # Intraday codes ordered from coarsest to finest.
    intraday = ["H", "T", "S", "L", "U", "N"]
    daily_and_finer = {"D", "C", "B", *intraday}

    if _is_annual(source):
        if _is_annual(target):
            # Annual -> annual only when both are anchored on the same month
            return get_rule_month(source) == get_rule_month(target)

        if _is_quarterly(target):
            smonth = get_rule_month(source)
            tmonth = get_rule_month(target)
            return _quarter_months_conform(smonth, tmonth)
        return target == "M" or target in daily_and_finer
    if _is_quarterly(source):
        return target == "M" or target in daily_and_finer
    if _is_monthly(source):
        return target in daily_and_finer
    if _is_weekly(source):
        return target == source or target in daily_and_finer

    # Any day-level code can be converted to any day-level or intraday code.
    if source in ("B", "C", "D"):
        return target in daily_and_finer

    # An intraday source converts to itself and anything finer.
    if source in intraday:
        return target in intraday[intraday.index(source):]

    return False
|
|
|
|
|
|
def _maybe_coerce_freq(code) -> str:
    """
    Normalize a frequency to an upper-cased rule code.

    Parameters
    ----------
    code : str or DateOffset
        Frequency to normalize; a DateOffset is replaced by its
        ``rule_code``. Must not be None.

    Returns
    -------
    str
    """
    assert code is not None
    rule_code = code.rule_code if isinstance(code, DateOffset) else code
    return rule_code.upper()
|
|
|
|
|
|
def _quarter_months_conform(source: str, target: str) -> bool:
    """
    Return whether two months occupy the same position within a quarter.

    Both arguments are keys of ``MONTH_NUMBERS``; the months conform when
    their numbers are equal modulo 3.
    """
    return MONTH_NUMBERS[source] % 3 == MONTH_NUMBERS[target] % 3
|
|
|
|
|
|
def _is_annual(rule: str) -> bool:
    """Return whether ``rule`` is "A" or an "A-<anchor>" alias (any case)."""
    # The text before the first dash is the whole rule when there is no dash.
    prefix, _, _ = rule.upper().partition("-")
    return prefix == "A"
|
|
|
|
|
|
def _is_quarterly(rule: str) -> bool:
    """
    Return whether ``rule`` is "Q", starts with "Q-", or starts with "BQ"
    (case-insensitive).
    """
    normalized = rule.upper()
    if normalized == "Q":
        return True
    return normalized.startswith(("Q-", "BQ"))
|
|
|
|
|
|
def _is_monthly(rule: str) -> bool:
    """Return whether ``rule`` is "M" or "BM" (case-insensitive)."""
    return rule.upper() in ("M", "BM")
|
|
|
|
|
|
def _is_weekly(rule: str) -> bool:
    """Return whether ``rule`` is "W" or a "W-<anchor>" alias (any case)."""
    # The text before the first dash is the whole rule when there is no dash.
    prefix, _, _ = rule.upper().partition("-")
    return prefix == "W"
|
|
|