
Commit ec5492f

[python-package] drop support for h2o datatable (#6894)
1 parent a725360 commit ec5492f

File tree

6 files changed (+16, -95 lines)

docs/Python-Intro.rst

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ The LightGBM Python module can load data from:
 
 - LibSVM (zero-based) / TSV / CSV format text file
 
-- NumPy 2D array(s), pandas DataFrame, H2O DataTable's Frame (deprecated), SciPy sparse matrix
+- NumPy 2D array(s), pandas DataFrame, pyarrow Table, SciPy sparse matrix
 
 - LightGBM binary file

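For context, the loading formats listed above map directly onto lightgbm.Dataset construction. A minimal sketch with toy data (shapes and column names are illustrative, not part of this commit):

    # Illustrative only: NumPy, pandas, and SciPy sparse inputs still work as before.
    import numpy as np
    import pandas as pd
    import scipy.sparse
    import lightgbm as lgb

    X = np.random.rand(100, 3)
    y = np.random.rand(100)

    ds_numpy = lgb.Dataset(X, label=y)
    ds_pandas = lgb.Dataset(pd.DataFrame(X, columns=["f0", "f1", "f2"]), label=y)
    ds_sparse = lgb.Dataset(scipy.sparse.csr_matrix(X), label=y)
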
docs/conf.py

Lines changed: 0 additions & 1 deletion
@@ -100,7 +100,6 @@ def run(self) -> List:
 autodoc_mock_imports = [
     "dask",
     "dask.distributed",
-    "datatable",
     "graphviz",
     "matplotlib",
     "numpy",

python-package/lightgbm/basic.py

Lines changed: 6 additions & 53 deletions
@@ -35,7 +35,6 @@
     arrow_is_floating,
     arrow_is_integer,
     concat,
-    dt_DataTable,
     pa_Array,
     pa_chunked_array,
     pa_ChunkedArray,
@@ -116,7 +115,6 @@
     Path,
     np.ndarray,
     pd_DataFrame,
-    dt_DataTable,
     scipy.sparse.spmatrix,
     "Sequence",
     List["Sequence"],
@@ -137,7 +135,6 @@
     Path,
     np.ndarray,
     pd_DataFrame,
-    dt_DataTable,
     scipy.sparse.spmatrix,
     pa_Table,
 ]
@@ -577,15 +574,6 @@ class LGBMDeprecationWarning(FutureWarning):
     pass
 
 
-def _emit_datatable_deprecation_warning() -> None:
-    msg = (
-        "Support for 'datatable' in LightGBM is deprecated, and will be removed in a future release. "
-        "To avoid this warning, convert 'datatable' inputs to a supported format "
-        "(for example, use the 'to_numpy()' method)."
-    )
-    warnings.warn(msg, category=LGBMDeprecationWarning, stacklevel=2)
-
-
 class _ConfigAliases:
     # lazy evaluation to allow import without dynamic library, e.g., for docs generation
     aliases = None
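
The removed warning pointed users at to_numpy() as the migration path. A hypothetical migration sketch (the datatable frame is toy data; the guard just keeps the snippet runnable without datatable installed):

    # Code that previously passed a datatable Frame straight to LightGBM
    # now converts it first, as the removed warning advised.
    import numpy as np
    import lightgbm as lgb

    try:
        import datatable as dt  # no longer accepted as a LightGBM input format
        X = dt.Frame(np.random.rand(100, 3)).to_numpy()
    except ImportError:
        X = np.random.rand(100, 3)

    train_set = lgb.Dataset(X, label=np.random.rand(100))
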
@@ -1112,7 +1100,7 @@ def predict(
 
         Parameters
         ----------
-        data : str, pathlib.Path, numpy array, pandas DataFrame, pyarrow Table, H2O DataTable's Frame (deprecated) or scipy.sparse
+        data : str, pathlib.Path, numpy array, pandas DataFrame, pyarrow Table or scipy.sparse
             Data source for prediction.
             If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM).
         start_iteration : int, optional (default=0)
@@ -1225,14 +1213,6 @@ def predict(
                 num_iteration=num_iteration,
                 predict_type=predict_type,
             )
-        elif isinstance(data, dt_DataTable):
-            _emit_datatable_deprecation_warning()
-            preds, nrow = self.__pred_for_np2d(
-                mat=data.to_numpy(),
-                start_iteration=start_iteration,
-                num_iteration=num_iteration,
-                predict_type=predict_type,
-            )
         else:
             try:
                 _log_warning("Converting data to scipy sparse matrix.")
@@ -1790,7 +1770,7 @@ def __init__(
 
         Parameters
         ----------
-        data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame (deprecated), scipy.sparse, Sequence, list of Sequence, list of numpy array or pyarrow Table
+        data : str, pathlib.Path, numpy array, pandas DataFrame, scipy.sparse, Sequence, list of Sequence, list of numpy array or pyarrow Table
             Data source of Dataset.
             If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM) or a LightGBM Dataset binary file.
         label : list, numpy 1-D array, pandas Series / one-column DataFrame, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None)
@@ -2196,9 +2176,6 @@ def _lazy_init(
             raise TypeError("Data list can only be of ndarray or Sequence")
         elif isinstance(data, Sequence):
             self.__init_from_seqs([data], ref_dataset)
-        elif isinstance(data, dt_DataTable):
-            _emit_datatable_deprecation_warning()
-            self.__init_from_np2d(data.to_numpy(), params_str, ref_dataset)
         else:
             try:
                 csr = scipy.sparse.csr_matrix(data)
@@ -2619,7 +2596,7 @@ def create_valid(
 
         Parameters
         ----------
-        data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame (deprecated), scipy.sparse, Sequence, list of Sequence or list of numpy array
+        data : str, pathlib.Path, numpy array, pandas DataFrame, scipy.sparse, Sequence, list of Sequence or list of numpy array
             Data source of Dataset.
             If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM) or a LightGBM Dataset binary file.
         label : list, numpy 1-D array, pandas Series / one-column DataFrame, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None)
@@ -3276,7 +3253,7 @@ def get_data(self) -> Optional[_LGBM_TrainDataType]:
 
         Returns
         -------
-        data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame (deprecated), scipy.sparse, Sequence, list of Sequence or list of numpy array or None
+        data : str, pathlib.Path, numpy array, pandas DataFrame, scipy.sparse, Sequence, list of Sequence or list of numpy array or None
             Raw data used in the Dataset construction.
         """
         if self._handle is None:
@@ -3288,9 +3265,6 @@ def get_data(self) -> Optional[_LGBM_TrainDataType]:
                     self.data = self.data[self.used_indices, :]
                 elif isinstance(self.data, pd_DataFrame):
                     self.data = self.data.iloc[self.used_indices].copy()
-                elif isinstance(self.data, dt_DataTable):
-                    _emit_datatable_deprecation_warning()
-                    self.data = self.data[self.used_indices, :]
                 elif isinstance(self.data, Sequence):
                     self.data = self.data[self.used_indices]
                 elif _is_list_of_sequences(self.data) and len(self.data) > 0:
@@ -3477,9 +3451,6 @@ def add_features_from(self, other: "Dataset") -> "Dataset":
                    self.data = np.hstack((self.data, other.data.toarray()))
                elif isinstance(other.data, pd_DataFrame):
                    self.data = np.hstack((self.data, other.data.values))
-               elif isinstance(other.data, dt_DataTable):
-                   _emit_datatable_deprecation_warning()
-                   self.data = np.hstack((self.data, other.data.to_numpy()))
                else:
                    self.data = None
            elif isinstance(self.data, scipy.sparse.spmatrix):
@@ -3488,9 +3459,6 @@ def add_features_from(self, other: "Dataset") -> "Dataset":
                    self.data = scipy.sparse.hstack((self.data, other.data), format=sparse_format)
                elif isinstance(other.data, pd_DataFrame):
                    self.data = scipy.sparse.hstack((self.data, other.data.values), format=sparse_format)
-               elif isinstance(other.data, dt_DataTable):
-                   _emit_datatable_deprecation_warning()
-                   self.data = scipy.sparse.hstack((self.data, other.data.to_numpy()), format=sparse_format)
                else:
                    self.data = None
            elif isinstance(self.data, pd_DataFrame):
@@ -3506,21 +3474,6 @@ def add_features_from(self, other: "Dataset") -> "Dataset":
                    self.data = concat((self.data, pd_DataFrame(other.data.toarray())), axis=1, ignore_index=True)
                elif isinstance(other.data, pd_DataFrame):
                    self.data = concat((self.data, other.data), axis=1, ignore_index=True)
-               elif isinstance(other.data, dt_DataTable):
-                   _emit_datatable_deprecation_warning()
-                   self.data = concat((self.data, pd_DataFrame(other.data.to_numpy())), axis=1, ignore_index=True)
-               else:
-                   self.data = None
-           elif isinstance(self.data, dt_DataTable):
-               _emit_datatable_deprecation_warning()
-               if isinstance(other.data, np.ndarray):
-                   self.data = dt_DataTable(np.hstack((self.data.to_numpy(), other.data)))
-               elif isinstance(other.data, scipy.sparse.spmatrix):
-                   self.data = dt_DataTable(np.hstack((self.data.to_numpy(), other.data.toarray())))
-               elif isinstance(other.data, pd_DataFrame):
-                   self.data = dt_DataTable(np.hstack((self.data.to_numpy(), other.data.values)))
-               elif isinstance(other.data, dt_DataTable):
-                   self.data = dt_DataTable(np.hstack((self.data.to_numpy(), other.data.to_numpy())))
                else:
                    self.data = None
            else:
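
For reference, add_features_from() is the public Dataset method touched by the hunks above; after this change it only merges raw data for numpy, scipy.sparse and pandas inputs. A rough usage sketch with toy data (not taken from the repo's tests):

    # Both Datasets must be constructed and keep their raw data for the merge to apply.
    import numpy as np
    import lightgbm as lgb

    d1 = lgb.Dataset(np.random.rand(100, 2), label=np.random.rand(100), free_raw_data=False).construct()
    d2 = lgb.Dataset(np.random.rand(100, 3), free_raw_data=False).construct()
    d1.add_features_from(d2)  # d1 now carries 5 features; its raw data stays a numpy array
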
@@ -4717,7 +4670,7 @@ def predict(
 
         Parameters
         ----------
-        data : str, pathlib.Path, numpy array, pandas DataFrame, pyarrow Table, H2O DataTable's Frame (deprecated) or scipy.sparse
+        data : str, pathlib.Path, numpy array, pandas DataFrame, pyarrow Table or scipy.sparse
             Data source for prediction.
             If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM).
         start_iteration : int, optional (default=0)
@@ -4798,7 +4751,7 @@ def refit(
 
         Parameters
         ----------
-        data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame (deprecated), scipy.sparse, Sequence, list of Sequence or list of numpy array
+        data : str, pathlib.Path, numpy array, pandas DataFrame, scipy.sparse, Sequence, list of Sequence or list of numpy array
             Data source for refit.
             If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM).
         label : list, numpy 1-D array, pandas Series / one-column DataFrame, pyarrow Array or pyarrow ChunkedArray

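Booster.predict() and Booster.refit() keep the remaining input types. A minimal end-to-end sketch (toy data, illustrative parameters):

    import numpy as np
    import lightgbm as lgb

    X = np.random.rand(200, 4)
    y = np.random.rand(200)
    booster = lgb.train({"objective": "regression", "verbosity": -1}, lgb.Dataset(X, label=y), num_boost_round=5)

    preds = booster.predict(X[:10])  # numpy input; pandas, pyarrow Table and scipy.sparse also work
    # A datatable Frame is no longer accepted here; convert it with .to_numpy() first.
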
python-package/lightgbm/compat.py

Lines changed: 0 additions & 19 deletions
@@ -202,25 +202,6 @@ def __init__(self, *args: Any, **kwargs: Any):
 except ImportError:
     GRAPHVIZ_INSTALLED = False
 
-"""datatable"""
-try:
-    import datatable
-
-    if hasattr(datatable, "Frame"):
-        dt_DataTable = datatable.Frame
-    else:
-        dt_DataTable = datatable.DataTable
-    DATATABLE_INSTALLED = True
-except ImportError:
-    DATATABLE_INSTALLED = False
-
-    class dt_DataTable:  # type: ignore
-        """Dummy class for datatable.DataTable."""
-
-        def __init__(self, *args: Any, **kwargs: Any):
-            pass
-
-
 """dask"""
 try:
     from dask import delayed

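The deleted block followed compat.py's usual convention for optional dependencies: try the import, set an *_INSTALLED flag, and fall back to a dummy class so annotations still resolve. A generic sketch of that pattern (the package name below is made up, not a LightGBM dependency):

    try:
        import some_optional_package

        SOME_OPTIONAL_INSTALLED = True
    except ImportError:
        SOME_OPTIONAL_INSTALLED = False

        class SomeOptionalType:  # placeholder so isinstance()/annotations keep working
            def __init__(self, *args, **kwargs):
                pass
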
python-package/lightgbm/sklearn.py

Lines changed: 5 additions & 7 deletions
@@ -41,7 +41,6 @@
     _LGBMRegressorBase,
     _LGBMValidateData,
     _sklearn_version,
-    dt_DataTable,
     pd_DataFrame,
 )
 from .engine import train
@@ -58,7 +57,6 @@
 ]
 
 _LGBM_ScikitMatrixLike = Union[
-    dt_DataTable,
     List[Union[List[float], List[int]]],
     np.ndarray,
     pd_DataFrame,
@@ -945,7 +943,7 @@ def fit(
         params["metric"] = [e for e in eval_metrics_builtin if e not in params["metric"]] + params["metric"]
         params["metric"] = [metric for metric in params["metric"] if metric is not None]
 
-        if not isinstance(X, (pd_DataFrame, dt_DataTable)):
+        if not isinstance(X, pd_DataFrame):
             _X, _y = _LGBMValidateData(
                 self,
                 X,
@@ -1077,7 +1075,7 @@ def fit(
 
     fit.__doc__ = (
         _lgbmmodel_doc_fit.format(
-            X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame (deprecated), scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
+            X_shape="numpy array, pandas DataFrame, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
             y_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples]",
             sample_weight_shape="numpy array, pandas Series, list of int or float of shape = [n_samples] or None, optional (default=None)",
             init_score_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)",
@@ -1104,7 +1102,7 @@ def predict(
         """Docstring is set after definition, using a template."""
         if not self.__sklearn_is_fitted__():
             raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.")
-        if not isinstance(X, (pd_DataFrame, dt_DataTable)):
+        if not isinstance(X, pd_DataFrame):
             X = _LGBMValidateData(
                 self,
                 X,
@@ -1154,7 +1152,7 @@ def predict(
 
     predict.__doc__ = _lgbmmodel_doc_predict.format(
         description="Return the predicted value for each sample.",
-        X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame (deprecated), scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
+        X_shape="numpy array, pandas DataFrame, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
         output_name="predicted_result",
         predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]",
         X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
@@ -1648,7 +1646,7 @@ def predict_proba(
 
     predict_proba.__doc__ = _lgbmmodel_doc_predict.format(
         description="Return the predicted probability for each class for each sample.",
-        X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame (deprecated), scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
+        X_shape="numpy array, pandas DataFrame, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
         output_name="predicted_probability",
         predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]",
         X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",

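At the estimator level, the accepted matrix types are now numpy arrays, pandas DataFrames, scipy.sparse matrices and lists of lists. An illustrative fit/predict sketch with toy data (names and sizes made up):

    import numpy as np
    import pandas as pd
    import lightgbm as lgb

    X = pd.DataFrame(np.random.rand(150, 3), columns=["a", "b", "c"])
    y = np.random.randint(0, 2, size=150)

    clf = lgb.LGBMClassifier(n_estimators=10).fit(X, y)
    proba = clf.predict_proba(X.to_numpy())  # non-DataFrame inputs go through scikit-learn validation
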
tests/python_package_test/test_sklearn.py

Lines changed: 4 additions & 14 deletions
@@ -24,10 +24,8 @@
 import lightgbm as lgb
 from lightgbm.compat import (
     DASK_INSTALLED,
-    DATATABLE_INSTALLED,
     PANDAS_INSTALLED,
     _sklearn_version,
-    dt_DataTable,
     pd_DataFrame,
     pd_Series,
 )
@@ -1883,14 +1881,12 @@ def test_predict_rejects_inputs_with_incorrect_number_of_features(predict_disabl
     assert preds.shape[0] == y.shape[0]
 
 
-@pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
+@pytest.mark.parametrize("X_type", ["list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
 @pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_Series", "pd_DataFrame"])
 @pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "regression"])
 def test_classification_and_regression_minimally_work_with_all_all_accepted_data_types(X_type, y_type, task, rng):
     if any(t.startswith("pd_") for t in [X_type, y_type]) and not PANDAS_INSTALLED:
         pytest.skip("pandas is not installed")
-    if any(t.startswith("dt_") for t in [X_type, y_type]) and not DATATABLE_INSTALLED:
-        pytest.skip("datatable is not installed")
     X, y, g = _create_data(task, n_samples=2_000)
     weights = np.abs(rng.standard_normal(size=(y.shape[0],)))
 
@@ -1902,9 +1898,7 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data
         raise ValueError(f"Unrecognized task '{task}'")
 
     X_valid = X * 2
-    if X_type == "dt_DataTable":
-        X = dt_DataTable(X)
-    elif X_type == "list2d":
+    if X_type == "list2d":
         X = X.tolist()
     elif X_type == "scipy_csc":
         X = scipy.sparse.csc_matrix(X)
@@ -1960,22 +1954,18 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data
         raise ValueError(f"Unrecognized task: '{task}'")
 
 
-@pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
+@pytest.mark.parametrize("X_type", ["list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
 @pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_DataFrame", "pd_Series"])
 @pytest.mark.parametrize("g_type", ["list1d_float", "list1d_int", "numpy", "pd_Series"])
 def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type, g_type, rng):
     if any(t.startswith("pd_") for t in [X_type, y_type, g_type]) and not PANDAS_INSTALLED:
         pytest.skip("pandas is not installed")
-    if any(t.startswith("dt_") for t in [X_type, y_type, g_type]) and not DATATABLE_INSTALLED:
-        pytest.skip("datatable is not installed")
     X, y, g = _create_data(task="ranking", n_samples=1_000)
     weights = np.abs(rng.standard_normal(size=(y.shape[0],)))
     init_score = np.full_like(y, np.mean(y))
     X_valid = X * 2
 
-    if X_type == "dt_DataTable":
-        X = dt_DataTable(X)
-    elif X_type == "list2d":
+    if X_type == "list2d":
         X = X.tolist()
     elif X_type == "scipy_csc":
         X = scipy.sparse.csc_matrix(X)

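Downstream projects that parametrized their own tests over a "dt_DataTable" input type can simply drop that case, mirroring the trimmed matrices above. A hypothetical sketch (test and data names are made up):

    import numpy as np
    import pytest
    import scipy.sparse
    import lightgbm as lgb

    @pytest.mark.parametrize("X_type", ["list2d", "numpy", "scipy_csr", "pd_DataFrame"])
    def test_pipeline_accepts_supported_inputs(X_type):
        X = np.random.rand(100, 2)
        y = np.random.rand(100)
        if X_type == "list2d":
            X = X.tolist()
        elif X_type == "scipy_csr":
            X = scipy.sparse.csr_matrix(X)
        elif X_type == "pd_DataFrame":
            pd = pytest.importorskip("pandas")
            X = pd.DataFrame(X)
        model = lgb.LGBMRegressor(n_estimators=5).fit(X, y)
        assert model.predict(X).shape == (100,)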