94 changes: 80 additions & 14 deletions src/pymor/core/infer_freq.py
@@ -157,8 +157,34 @@ def _infer_frequency_core(
return FrequencyResult(None, None, None, False, error_status)
return None
deltas = np.diff(ordinals)
median_delta = np.median(deltas)
std_delta = np.std(deltas)

# Check if there are any zero deltas (duplicates) in the original data
has_duplicates = np.any(deltas <= 1e-10)

# Filter out zero deltas (duplicates) to avoid them dominating the frequency inference
non_zero_deltas = deltas[
deltas > 1e-10
] # Use small epsilon to handle floating point precision

if len(non_zero_deltas) == 0:
# All deltas are zero (all duplicates) - cannot infer frequency
if log:
log_frequency_check(
"Time Series", None, 0.0, None, False, "all_duplicates", strict
)
return (
FrequencyResult(None, 0.0, None, False, "all_duplicates")
if return_metadata
else None
)

# To handle irregular time series with gaps, find the most common delta from non-zero deltas.
# Round to two decimals (hundredths of a day) so near-identical deltas group together.
rounded_deltas = np.round(non_zero_deltas, decimals=2)
unique_deltas, counts = np.unique(rounded_deltas, return_counts=True)
most_common_delta_index = np.argmax(counts)
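# NOTE: despite its name, median_delta now holds the modal (most common) delta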
median_delta = float(unique_deltas[most_common_delta_index])
std_delta = np.std(non_zero_deltas)
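
A standalone sketch (not part of the diff, values invented for illustration) of what the filtering plus the switch to the modal delta buys: with duplicated timestamps, the raw median of the deltas lands halfway between zero and the true spacing, while the mode of the rounded non-zero deltas recovers it.

```python
import numpy as np

# Monthly ordinals with each stamp duplicated once.
ordinals = np.array([0.0, 0.0, 30.0, 30.0, 60.0])
deltas = np.diff(ordinals)                      # [0, 30, 0, 30]

print(np.median(deltas))                        # 15.0 -- matches no frequency

non_zero = deltas[deltas > 1e-10]               # [30, 30]
vals, counts = np.unique(np.round(non_zero, 2), return_counts=True)
print(float(vals[np.argmax(counts)]))           # 30.0 -- matches 'M'
```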

days_in_calendar_year = {
"standard": 365.25,
@@ -179,15 +205,25 @@

matched_freq = None
matched_step = None
for freq, base_days in base_freqs.items():
for step in range(1, 13):
test_delta = base_days * step
if abs(median_delta - test_delta) <= tol * test_delta:
matched_freq = freq
matched_step = step
# Prioritize monthly check to avoid incorrect '4W' match
monthly_days = base_freqs["M"]
# Use a larger tolerance for monthly checks to account for varying month lengths
if abs(median_delta - monthly_days) <= 0.15 * monthly_days:
matched_freq = "M"
matched_step = 1
else:
for freq, base_days in base_freqs.items():
# Skip monthly check as it's already done
if freq == "M":
continue
for step in range(1, 13):
test_delta = base_days * step
if abs(median_delta - test_delta) <= tol * test_delta:
matched_freq = freq
matched_step = step
break
if matched_freq:
break
if matched_freq:
break
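
The monthly-first branch exists because a monthly spacing can sit inside both candidate windows at once; a small numeric check (the base tolerance here is hypothetical, and ~30.44 days assumes a mean-month base of 365.25 / 12):

```python
# A 30-day modal delta (e.g. a 360-day calendar) fits the '4W' window as well
# as the 'M' window, so whichever is tested first wins.
median_delta = 30.0
tol = 0.08                                             # hypothetical base tolerance

four_weeks = 7.0 * 4                                   # 28.0 days
month = 30.44                                          # mean Gregorian month length

print(abs(median_delta - four_weeks) <= tol * four_weeks)  # True -> would match '4W'
print(abs(median_delta - month) <= 0.15 * month)           # True -> 'M' wins when checked first
```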

if matched_freq is None:
# For irregular time series, try to find the closest match with relaxed tolerance
@@ -213,8 +249,14 @@
else None
)

is_exact = std_delta < tol * (base_freqs[matched_freq] * matched_step)
status = "valid" if is_exact else "irregular"
is_exact = bool(std_delta < tol * (base_freqs[matched_freq] * matched_step))

# If there are duplicates in the original data, mark as irregular regardless of frequency match
if has_duplicates:
status = "irregular"
is_exact = False
else:
status = "valid" if is_exact else "irregular"

if strict:
expected_steps = (ordinals[-1] - ordinals[0]) / (
@@ -285,10 +327,32 @@ def infer_frequency(
try:
freq = xr.infer_freq(times_values)
if freq is not None:
# Calculate delta_days even when xarray.infer_freq succeeds
delta_days = None
if return_metadata and len(times_values) >= 2:
try:
ordinals = _convert_times_to_ordinals(times_values)
deltas = np.diff(ordinals)
# Filter out zero deltas and find the most common delta
non_zero_deltas = deltas[deltas > 1e-10]
if len(non_zero_deltas) > 0:
# Use the most common delta (similar to _infer_frequency_core)
rounded_deltas = np.round(non_zero_deltas, decimals=2)
unique_deltas, counts = np.unique(
rounded_deltas, return_counts=True
)
most_common_delta_index = np.argmax(counts)
delta_days = float(unique_deltas[most_common_delta_index])
except Exception:
# If delta calculation fails, keep delta_days as None
pass

if log:
log_frequency_check("Time Series", freq, None, 1, True, "valid", strict)
log_frequency_check(
"Time Series", freq, delta_days, 1, True, "valid", strict
)
return (
FrequencyResult(freq, None, 1, True, "valid")
FrequencyResult(freq, delta_days, 1, True, "valid")
if return_metadata
else freq
)
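
For reference, a usage sketch of the changed fast path; the import path is assumed from the repository layout (src/pymor/core/infer_freq.py), not confirmed by the diff.

```python
import pandas as pd
from pymor.core.infer_freq import infer_frequency  # assumed import path

times = pd.date_range("2000-01-01", periods=5, freq="D")
result = infer_frequency(times, return_metadata=True)

# delta_days is now populated even when xarray.infer_freq succeeds;
# previously this path returned FrequencyResult(freq, None, 1, True, "valid").
print(result.frequency, result.delta_days, result.status)  # D 1.0 valid
```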
@@ -513,6 +577,7 @@ def is_resolution_fine_enough(
"inferred_interval": None,
"comparison_status": "unknown",
"is_valid_for_resampling": False,
"status": "unknown",
}

freq = result.frequency
@@ -531,6 +596,7 @@
"inferred_interval": None,
"comparison_status": status,
"is_valid_for_resampling": False,
"status": status,
}

comparison_status = status
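
A usage sketch of the extended return dict (same assumed import path). Both early-return dicts in this diff now carry "status" alongside "comparison_status", so callers see a uniform shape on every path.

```python
import pandas as pd
from pymor.core.infer_freq import is_resolution_fine_enough  # assumed import path

times = pd.date_range("2000-01-01", periods=12, freq="MS")
info = is_resolution_fine_enough(times, target_approx_interval=30.0, log=False)

# Callers can key on "status" without special-casing the early exits.
print(info["status"], info["comparison_status"], info["is_valid_for_resampling"])
```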
98 changes: 64 additions & 34 deletions tests/unit/test_infer_freq.py
@@ -1,5 +1,6 @@
import cftime
import numpy as np
import pandas as pd
import pytest
import xarray as xr

@@ -442,9 +443,10 @@ def test_accessor_invalid_manual_time_dim():
"""Test behavior when manually specified time_dim doesn't exist."""
da = xr.DataArray([1, 2, 3], coords={"x": [1, 2, 3]}, dims=["x"])

# When time_dim doesn't exist, it should return a result with no_match status
# When time_dim doesn't exist, it uses the coordinate values [1, 2, 3] which
# get converted to ordinals with zero deltas, hence "all_duplicates"
result = da.timefreq.infer_frequency(time_dim="nonexistent")
assert result.status == "no_match"
assert result.status == "all_duplicates"


def test_dataset_accessor_no_datetime_coord_error():
@@ -510,8 +512,6 @@ def test_numpy_datetime64_with_different_units():

def test_resample_safe_error_paths():
"""Test error paths in resample_safe methods."""
import pandas as pd

# Create a coarse time series (quarterly)
coarse_times = pd.date_range("2000-01-01", periods=4, freq="QS")
da = xr.DataArray([1, 2, 3, 4], coords={"time": coarse_times}, dims=["time"])
@@ -558,8 +558,6 @@ def test_log_frequency_check_function():

def test_pandas_datetime_index_input():
"""Test with pandas DatetimeIndex input."""
import pandas as pd

# Test with pandas DatetimeIndex
times_index = pd.date_range("2000-01-01", periods=5, freq="D")
result = infer_frequency(times_index, return_metadata=True)
@@ -569,8 +567,6 @@ def test_pandas_datetime_index_input():

def test_get_time_label_dataset_with_time_coord():
"""Test get_time_label with Dataset containing 'time' coordinate."""
import pandas as pd

# Create dataset with time coordinate
time_coord = pd.date_range("2000-01-01", periods=10)
ds = xr.Dataset(
@@ -583,8 +579,6 @@ def test_get_time_label_dataset_with_custom_time_coord():

def test_get_time_label_dataset_with_custom_time_coord():
"""Test get_time_label with Dataset containing custom time coordinate name."""
import pandas as pd

# Create dataset with 'T' as time coordinate
time_coord = pd.date_range("2000-01-01", periods=5)
ds = xr.Dataset({"data": (["T"], np.random.rand(5))}, coords={"T": time_coord})
@@ -595,8 +589,6 @@ def test_get_time_label_dataarray_with_time_coord():

def test_get_time_label_dataarray_with_time_coord():
"""Test get_time_label with DataArray containing time coordinate."""
import pandas as pd

# Create DataArray with time coordinate
time_coord = pd.date_range("2000-01-01", periods=8)
da = xr.DataArray(np.random.rand(8), coords={"time": time_coord}, dims=["time"])
@@ -607,8 +599,6 @@ def test_get_time_label_dataarray_with_custom_time_coord():

def test_get_time_label_dataarray_with_custom_time_coord():
"""Test get_time_label with DataArray containing custom time coordinate name."""
import pandas as pd

# Create DataArray with 'T' as time coordinate
time_coord = pd.date_range("2000-01-01", periods=6)
da = xr.DataArray(np.random.rand(6), coords={"T": time_coord}, dims=["T"])
@@ -654,8 +644,6 @@ def test_get_time_label_multiple_datetime_coords():

def test_get_time_label_multiple_datetime_coords():
"""Test get_time_label with multiple datetime coordinates."""
import pandas as pd

# Create dataset with multiple datetime coordinates
# The function uses appendleft(), so the last processed coord gets priority
time1 = pd.date_range("2000-01-01", periods=3)
@@ -677,8 +665,6 @@ def test_get_time_label_datetime_coord_not_used_by_datavar():

def test_get_time_label_datetime_coord_not_used_by_datavar():
"""Test get_time_label when datetime coord exists but not used by data variables."""
import pandas as pd

# Create dataset where datetime coord exists but no data variable uses it
time_coord = pd.date_range("2000-01-01", periods=5)
ds = xr.Dataset(
@@ -692,8 +678,6 @@ def test_get_time_label_scalar_datetime_coord():

def test_get_time_label_scalar_datetime_coord():
"""Test get_time_label with scalar datetime coordinate (no dimensions)."""
import pandas as pd

# Create dataset with scalar datetime coordinate
ds = xr.Dataset(
{"data": (["x"], np.random.rand(3))},
@@ -925,10 +909,16 @@ def test_check_resolution_with_strict_mode():
assert "status" in result_non_strict


def test_infer_freq_with_missing_month():
"""Test that infer_frequency can handle a single missing month."""
# Monthly data, but March is missing
times = pd.to_datetime(["2000-01-31", "2000-02-29", "2000-04-30"])
freq = infer_frequency(times)
assert freq == "M", f"Expected 'M', but got {freq}"
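
Worked numbers for this test (assuming the mean-month base of roughly 30.44 days): the two deltas are 29 and 61 days, the tie in the delta histogram resolves to the first value, and the widened monthly window accepts it.

```python
import numpy as np

deltas = np.array([29.0, 61.0])             # Jan 31 -> Feb 29, Feb 29 -> Apr 30
vals, counts = np.unique(np.round(deltas, 2), return_counts=True)
modal = float(vals[np.argmax(counts)])      # counts tie at 1; argmax picks 29.0
print(abs(modal - 30.44) <= 0.15 * 30.44)   # |29 - 30.44| = 1.44 <= 4.57 -> 'M'
```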


def test_check_resolution_with_pandas_datetime():
"""Test check_resolution with pandas datetime objects."""
import pandas as pd

# Create monthly time series with pandas datetime
times = pd.date_range("2000-01-01", periods=3, freq="MS")
da = xr.DataArray([1, 2, 3], coords={"time": times}, dims="time")
@@ -971,27 +961,67 @@ def test_check_resolution_tolerance_parameter():

def test_check_resolution_return_format():
"""Test that check_resolution returns the expected dictionary format."""
times = [
cftime.Datetime360Day(2000, 1, 1),
cftime.Datetime360Day(2000, 2, 1),
cftime.Datetime360Day(2000, 3, 1),
]
da = xr.DataArray([1, 2, 3], coords={"time": times}, dims="time")
times = pd.date_range("2000-01-01", periods=12, freq="M")

result = da.timefreq.check_resolution(target_approx_interval=30.0, log=False)
result = is_resolution_fine_enough(times, target_approx_interval=30.0, log=False)

# Check that all expected keys are present
expected_keys = [
expected_keys = {
"inferred_interval",
"comparison_status",
"is_valid_for_resampling",
"status",
]
for key in expected_keys:
assert key in result
}
assert set(result.keys()) >= expected_keys

# Check data types
# Check types
assert isinstance(result["inferred_interval"], (float, type(None)))
assert isinstance(result["comparison_status"], str)
assert isinstance(result["is_valid_for_resampling"], bool)
assert isinstance(result["status"], str)


def test_infer_frequency_with_duplicates():
"""Test that infer_frequency correctly handles duplicate timestamps."""

# Test case 1: Monthly data with duplicates should return 'M'
monthly_with_duplicates = [
cftime.Datetime360Day(2000, 1, 16, 0, 0, 0, 0, has_year_zero=True),
cftime.Datetime360Day(2000, 1, 16, 0, 0, 0, 0, has_year_zero=True), # duplicate
cftime.Datetime360Day(2000, 2, 16, 0, 0, 0, 0, has_year_zero=True),
cftime.Datetime360Day(2000, 2, 16, 0, 0, 0, 0, has_year_zero=True), # duplicate
cftime.Datetime360Day(2000, 3, 16, 0, 0, 0, 0, has_year_zero=True),
]

result = infer_frequency(monthly_with_duplicates, return_metadata=True)
assert result.frequency == "M"
assert result.delta_days == 30.0
assert result.status == "irregular" # Should be irregular due to duplicates
assert result.is_exact is False

# Test case 2: Daily data with duplicates should return 'D'
daily_with_duplicates = [
cftime.Datetime360Day(2000, 1, 1, 0, 0, 0, 0, has_year_zero=True),
cftime.Datetime360Day(2000, 1, 1, 0, 0, 0, 0, has_year_zero=True), # duplicate
cftime.Datetime360Day(2000, 1, 2, 0, 0, 0, 0, has_year_zero=True),
cftime.Datetime360Day(2000, 1, 3, 0, 0, 0, 0, has_year_zero=True),
cftime.Datetime360Day(2000, 1, 3, 0, 0, 0, 0, has_year_zero=True), # duplicate
cftime.Datetime360Day(2000, 1, 4, 0, 0, 0, 0, has_year_zero=True),
]

result = infer_frequency(daily_with_duplicates, return_metadata=True)
assert result.frequency == "D"
assert result.delta_days == 1.0

# Test case 3: All duplicates should return 'all_duplicates' status
all_duplicates = [
cftime.Datetime360Day(2000, 1, 1, 0, 0, 0, 0, has_year_zero=True),
cftime.Datetime360Day(2000, 1, 1, 0, 0, 0, 0, has_year_zero=True),
cftime.Datetime360Day(2000, 1, 1, 0, 0, 0, 0, has_year_zero=True),
]

result = infer_frequency(all_duplicates, return_metadata=True)
assert result.frequency is None
assert result.delta_days == 0.0
assert result.status == "all_duplicates"