Commit b41c932e authored by Kruyff, D.L.W. (Dylan)

Redo lost changes

parent 8d4ead19
@@ -9,8 +9,9 @@ from pandas.io.formats import format as pandas_format
import dask
import dask.array as da
from dask.array.numpy_compat import _numpy_118
from dask.array.numpy_compat import _numpy_118, _numpy_120
import dask.dataframe as dd
from dask.blockwise import fuse_roots
from dask.dataframe import _compat
from dask.dataframe._compat import tm, PANDAS_GT_100, PANDAS_GT_110
from dask.base import compute_as_if_collection
@@ -22,7 +23,6 @@ from dask.dataframe.core import (
_concat,
Scalar,
has_parallel_type,
iter_chunks,
total_mem_usage,
is_broadcastable,
)
@@ -37,6 +37,9 @@ dsk = {
meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8"))
d = dd.DataFrame(dsk, "x", meta, [0, 5, 9, 9])
full = d.compute()
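# pandas>=1.1 adds a check_freq argument to its testing asserts; disable it
# since dask results may not preserve the index freq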
CHECK_FREQ = {}
if dd._compat.PANDAS_GT_110:
CHECK_FREQ["check_freq"] = False
def test_dataframe_doc():
@@ -222,7 +225,18 @@ def test_index_names():
assert ddf.index.compute().name == "x"
@pytest.mark.parametrize("npartitions", [1, pytest.param(2, marks=pytest.mark.xfail)])
@pytest.mark.parametrize(
"npartitions",
[
1,
pytest.param(
2,
marks=pytest.mark.xfail(
not dd._compat.PANDAS_GT_110, reason="Fixed upstream."
),
),
],
)
def test_timezone_freq(npartitions):
s_naive = pd.Series(pd.date_range("20130101", periods=10))
s_aware = pd.Series(pd.date_range("20130101", periods=10, tz="US/Eastern"))
@@ -385,12 +399,48 @@ def test_describe_numeric(method, test_values):
(None, None, None, ["c", "d", "g"]), # numeric + bool
(None, None, None, ["c", "d", "f", "g"]), # numeric + bool + timedelta
(None, None, None, ["f", "g"]), # bool + timedelta
("all", None, None, None),
(["number"], None, [0.25, 0.5], None),
([np.timedelta64], None, None, None),
(["number", "object"], None, [0.25, 0.75], None),
(None, ["number", "object"], None, None),
(["object", "datetime", "bool"], None, None, None),
pytest.param(
"all",
None,
None,
None,
marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
),
pytest.param(
["number"],
None,
[0.25, 0.5],
None,
marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
),
pytest.param(
[np.timedelta64],
None,
None,
None,
marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
),
pytest.param(
["number", "object"],
None,
[0.25, 0.75],
None,
marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
),
pytest.param(
None,
["number", "object"],
None,
None,
marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
),
pytest.param(
["object", "datetime", "bool"],
None,
None,
None,
marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
),
],
)
def test_describe(include, exclude, percentiles, subset):
@@ -1047,7 +1097,7 @@ def test_value_counts_with_dropna():
result = ddf.x.value_counts(dropna=False)
expected = df.x.value_counts(dropna=False)
assert_eq(result, expected)
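# split_every only changes the shape of the reduction tree, so dropna must
# still match the expected result above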
result2 = ddf.x.value_counts(split_every=2)
result2 = ddf.x.value_counts(split_every=2, dropna=False)
assert_eq(result2, expected)
assert result._name != result2._name
@@ -1095,6 +1145,14 @@ def test_isin():
d.isin(obj)
def test_contains_frame():
df = dd.from_pandas(pd.DataFrame({"A": [1, 2], 0: [3, 4]}), 1)
assert "A" in df
assert 0 in df
assert "B" not in df
assert 1 not in df
def test_len():
assert len(d) == len(full)
assert len(d.a) == len(full.a)
@@ -1836,7 +1894,7 @@ def test_repartition_npartitions(use_index, n, k, dtype, transform):
)
df = transform(df)
a = dd.from_pandas(df, npartitions=n, sort=use_index)
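# the new partition count may also be given positionally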
b = a.repartition(npartitions=k)
b = a.repartition(k)
assert_eq(a, b)
assert b.npartitions == k
parts = dask.get(b.dask, b.__dask_keys__())
@@ -1861,19 +1919,11 @@ def test_repartition_partition_size(use_index, n, partition_size, transform):
assert all(map(len, parts))
def test_iter_chunks():
sizes = [14, 8, 5, 9, 7, 9, 1, 19, 8, 19]
assert list(iter_chunks(sizes, 19)) == [
[14],
[8, 5],
[9, 7],
[9, 1],
[19],
[8],
[19],
]
assert list(iter_chunks(sizes, 28)) == [[14, 8, 5], [9, 7, 9, 1], [19, 8], [19]]
assert list(iter_chunks(sizes, 67)) == [[14, 8, 5, 9, 7, 9, 1], [19, 8, 19]]
def test_repartition_partition_size_arg():
df = pd.DataFrame({"x": range(10)})
a = dd.from_pandas(df, npartitions=2)
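# a size string is interpreted as a target partition size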
b = a.repartition("1 MiB")
assert b.npartitions == 1
def test_repartition_npartitions_same_limits():
@@ -2522,15 +2572,17 @@ def test_to_timestamp():
index = pd.period_range(freq="A", start="1/1/2001", end="12/1/2004")
df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [10, 20, 30, 40]}, index=index)
ddf = dd.from_pandas(df, npartitions=3)
assert_eq(ddf.to_timestamp(), df.to_timestamp())
assert_eq(ddf.to_timestamp(), df.to_timestamp(), **CHECK_FREQ)
assert_eq(
ddf.to_timestamp(freq="M", how="s").compute(),
df.to_timestamp(freq="M", how="s"),
**CHECK_FREQ
)
assert_eq(ddf.x.to_timestamp(), df.x.to_timestamp())
assert_eq(
ddf.x.to_timestamp(freq="M", how="s").compute(),
df.x.to_timestamp(freq="M", how="s"),
**CHECK_FREQ
)
@@ -3004,6 +3056,22 @@ def test_dataframe_itertuples():
assert a == b
@pytest.mark.parametrize(
"columns",
[
("x", "y"),
("x", "x"),
pd.MultiIndex.from_tuples([("x", 1), ("x", 2)], names=("letter", "number")),
],
)
def test_dataframe_items(columns):
df = pd.DataFrame([[1, 10], [2, 20], [3, 30], [4, 40]], columns=columns)
ddf = dd.from_pandas(df, npartitions=2)
for (a, b) in zip(df.items(), ddf.items()):
assert a[0] == b[0] # column name
assert_eq(a[1], b[1].compute()) # column values
def test_dataframe_itertuples_with_index_false():
df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [10, 20, 30, 40]})
ddf = dd.from_pandas(df, npartitions=2)
@@ -3968,6 +4036,7 @@ def test_map_partition_array(func):
assert x.chunks[0] == (np.nan, np.nan)
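# "sparse-383" refers to pydata/sparse issue #383; these tests fail under
# NumPy>=1.20 (which _numpy_120 detects)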
@pytest.mark.xfail(_numpy_120, reason="sparse-383")
def test_map_partition_sparse():
sparse = pytest.importorskip("sparse")
# Avoid searchsorted failure.
@@ -4308,3 +4377,16 @@ def test_dataframe_groupby_agg_empty_partitions():
df = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6, 7, 8]})
ddf = dd.from_pandas(df, npartitions=4)
assert_eq(ddf[ddf.x < 5].x.cumsum(), df[df.x < 5].x.cumsum())
def test_fuse_roots():
pdf1 = pd.DataFrame(
{"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [3, 5, 2, 5, 7, 2, 4, 2, 4]}
)
ddf1 = dd.from_pandas(pdf1, 2)
pdf2 = pd.DataFrame({"a": [True, False, True] * 3, "b": [False, False, True] * 3})
ddf2 = dd.from_pandas(pdf2, 2)
res = ddf1.where(ddf2)
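# fusing root tasks should leave a HighLevelGraph that still validates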
hlg = fuse_roots(res.__dask_graph__(), keys=res.__dask_keys__())
hlg.validate()
@@ -41,7 +41,11 @@ def test_reduction():
dser = dd.from_pandas(ser, 2)
assert_eq(ser.mean(skipna=False), dser.mean(skipna=False))
assert_eq(ser.to_frame().mean(skipna=False), dser.to_frame().mean(skipna=False))
# It's unclear whether this can be reliably provided, at least with the current
# implementation, which uses pandas.DataFrame.sum(), returning a (homogeneous)
# series whose values may have been cast.
# assert_eq(ser.to_frame().mean(skipna=False), dser.to_frame().mean(skipna=False))
def test_scalar():
@@ -19,6 +19,9 @@ dsk = {
meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8"))
d = dd.DataFrame(dsk, "x", meta, [0, 5, 9, 9])
full = d.compute()
CHECK_FREQ = {}
if dd._compat.PANDAS_GT_110:
CHECK_FREQ["check_freq"] = False
def test_loc():
@@ -369,24 +372,35 @@ def test_loc_timestamp_str():
assert_eq(df.loc["2011-01-02"], ddf.loc["2011-01-02"])
assert_eq(df.loc["2011-01-02":"2011-01-10"], ddf.loc["2011-01-02":"2011-01-10"])
# same resolution; the dask result is always a DataFrame
assert_eq(df.loc["2011-01-02 10:00"].to_frame().T, ddf.loc["2011-01-02 10:00"])
assert_eq(
df.loc["2011-01-02 10:00"].to_frame().T,
ddf.loc["2011-01-02 10:00"],
**CHECK_FREQ
)
# series
assert_eq(df.A.loc["2011-01-02"], ddf.A.loc["2011-01-02"])
assert_eq(df.A.loc["2011-01-02":"2011-01-10"], ddf.A.loc["2011-01-02":"2011-01-10"])
assert_eq(df.A.loc["2011-01-02"], ddf.A.loc["2011-01-02"], **CHECK_FREQ)
assert_eq(
df.A.loc["2011-01-02":"2011-01-10"],
ddf.A.loc["2011-01-02":"2011-01-10"],
**CHECK_FREQ
)
# slice with timestamp (dask result must be DataFrame)
assert_eq(
df.loc[pd.Timestamp("2011-01-02")].to_frame().T,
ddf.loc[pd.Timestamp("2011-01-02")],
**CHECK_FREQ
)
assert_eq(
df.loc[pd.Timestamp("2011-01-02") : pd.Timestamp("2011-01-10")],
ddf.loc[pd.Timestamp("2011-01-02") : pd.Timestamp("2011-01-10")],
**CHECK_FREQ
)
assert_eq(
df.loc[pd.Timestamp("2011-01-02 10:00")].to_frame().T,
ddf.loc[pd.Timestamp("2011-01-02 10:00")],
**CHECK_FREQ
)
df = pd.DataFrame(
@@ -170,19 +170,21 @@ def test_get_dummies_errors():
dd.get_dummies(ddf.x)
@pytest.mark.parametrize("values", ["B", ["B"], ["B", "D"]])
@pytest.mark.parametrize("aggfunc", ["mean", "sum", "count"])
def test_pivot_table(aggfunc):
def test_pivot_table(values, aggfunc):
df = pd.DataFrame(
{
"A": np.random.choice(list("XYZ"), size=100),
"B": np.random.randn(100),
"C": pd.Categorical(np.random.choice(list("abc"), size=100)),
"D": np.random.randn(100),
}
)
ddf = dd.from_pandas(df, 5)
res = dd.pivot_table(ddf, index="A", columns="C", values="B", aggfunc=aggfunc)
exp = pd.pivot_table(df, index="A", columns="C", values="B", aggfunc=aggfunc)
res = dd.pivot_table(ddf, index="A", columns="C", values=values, aggfunc=aggfunc)
exp = pd.pivot_table(df, index="A", columns="C", values=values, aggfunc=aggfunc)
if aggfunc == "count":
# depending on divisions, the dask result contains NaN and so cannot be int64 dtype
exp = exp.astype(np.float64)
@@ -190,8 +192,8 @@ def test_pivot_table(aggfunc):
assert_eq(res, exp)
# method
res = ddf.pivot_table(index="A", columns="C", values="B", aggfunc=aggfunc)
exp = df.pivot_table(index="A", columns="C", values="B", aggfunc=aggfunc)
res = ddf.pivot_table(index="A", columns="C", values=values, aggfunc=aggfunc)
exp = df.pivot_table(index="A", columns="C", values=values, aggfunc=aggfunc)
if aggfunc == "count":
# depending on divisions, the dask result contains NaN and so cannot be int64 dtype
exp = exp.astype(np.float64)
@@ -249,9 +251,9 @@ def test_pivot_table_errors():
with pytest.raises(ValueError) as err:
dd.pivot_table(ddf, index="A", columns=["C"], values="B")
assert msg in str(err.value)
msg = "'values' must be the name of an existing column"
msg = "'values' must refer to an existing column or columns"
with pytest.raises(ValueError) as err:
dd.pivot_table(ddf, index="A", columns="C", values=["B"])
dd.pivot_table(ddf, index="A", columns="C", values=[["B"]])
assert msg in str(err.value)
msg = "aggfunc must be either 'mean', 'sum' or 'count'"
@@ -4,6 +4,7 @@ import pandas as pd
import pytest
import numpy as np
import dask.array as da
import dask.dataframe as dd
from dask.dataframe.utils import assert_eq, PANDAS_VERSION
@@ -139,6 +140,10 @@ rolling_method_args_check_less_precise = [
@pytest.mark.parametrize("window", [1, 2, 4, 5])
@pytest.mark.parametrize("center", [True, False])
def test_rolling_methods(method, args, window, center, check_less_precise):
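# pandas>=1.1 deprecates check_less_precise in favor of rtol/atol, so only
# forward the keyword on older pandas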
if dd._compat.PANDAS_GT_110:
check_less_precise = {}
else:
check_less_precise = {"check_less_precise": check_less_precise}
# DataFrame
prolling = df.rolling(window, center=center)
drolling = ddf.rolling(window, center=center)
@@ -150,7 +155,7 @@ def test_rolling_methods(method, args, window, center, check_less_precise):
assert_eq(
getattr(prolling, method)(*args, **kwargs),
getattr(drolling, method)(*args, **kwargs),
check_less_precise=check_less_precise,
**check_less_precise,
)
# Series
@@ -159,7 +164,7 @@ def test_rolling_methods(method, args, window, center, check_less_precise):
assert_eq(
getattr(prolling, method)(*args, **kwargs),
getattr(drolling, method)(*args, **kwargs),
check_less_precise=check_less_precise,
**check_less_precise,
)
@@ -264,6 +269,14 @@ def test_time_rolling_constructor():
)
@pytest.mark.parametrize("window", ["1S", "2S", "3S", pd.offsets.Second(5)])
def test_time_rolling_methods(method, args, window, check_less_precise):
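# on pandas>=1.1 emulate check_less_precise=True with explicit tolerances
# (roughly three decimal places), since the keyword is deprecated there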
if dd._compat.PANDAS_GT_110:
if check_less_precise:
check_less_precise = {"atol": 0.5e-3, "rtol": 0.5e-3}
else:
check_less_precise = {}
else:
check_less_precise = {"check_less_precise": check_less_precise}
# DataFrame
if method == "apply":
kwargs = {"raw": False}
@@ -274,7 +287,7 @@ def test_time_rolling_methods(method, args, window, check_less_precise):
assert_eq(
getattr(prolling, method)(*args, **kwargs),
getattr(drolling, method)(*args, **kwargs),
check_less_precise=check_less_precise,
**check_less_precise,
)
# Series
@@ -283,7 +296,7 @@ def test_time_rolling_methods(method, args, window, check_less_precise):
assert_eq(
getattr(prolling, method)(*args, **kwargs),
getattr(drolling, method)(*args, **kwargs),
check_less_precise=check_less_precise,
**check_less_precise,
)
@@ -378,6 +391,7 @@ def test_rolling_agg_aggregate():
@pytest.mark.skipif(not dd._compat.PANDAS_GT_100, reason="needs pandas>=1.0.0")
@pytest.mark.xfail(da.numpy_compat._numpy_120, reason="sparse-383")
def test_rolling_numba_engine():
numba = pytest.importorskip("numba")
if not dd._compat.PANDAS_GT_104 and LooseVersion(numba.__version__) >= "0.49":
@@ -27,7 +27,6 @@ from dask.dataframe.shuffle import (
)
from dask.dataframe.utils import assert_eq, make_meta
dsk = {
("x", 0): pd.DataFrame({"a": [1, 2, 3], "b": [1, 4, 7]}, index=[0, 1, 3]),
("x", 1): pd.DataFrame({"a": [4, 5, 6], "b": [2, 5, 8]}, index=[5, 6, 8]),
@@ -36,6 +35,9 @@ dsk = {
meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8"))
d = dd.DataFrame(dsk, "x", meta, [0, 4, 9, 9])
full = d.compute()
CHECK_FREQ = {}
if dd._compat.PANDAS_GT_110:
CHECK_FREQ["check_freq"] = False
shuffle_func = shuffle # conflicts with keyword argument
@@ -772,7 +774,7 @@ def test_set_index_on_empty():
ddf = ddf[ddf.y > df.y.max()].set_index("x")
expected_df = df[df.y > df.y.max()].set_index("x")
assert assert_eq(ddf, expected_df)
assert assert_eq(ddf, expected_df, **CHECK_FREQ)
assert ddf.npartitions == 1
@@ -916,8 +918,8 @@ def test_set_index_timestamp():
assert ts1.value == ts2.value
assert ts1.tz == ts2.tz
assert_eq(df2, ddf_new_div)
assert_eq(df2, ddf.set_index("A"))
assert_eq(df2, ddf_new_div, **CHECK_FREQ)
assert_eq(df2, ddf.set_index("A"), **CHECK_FREQ)
@pytest.mark.parametrize("compression", [None, "ZLib"])
@@ -17,6 +17,7 @@ from dask.dataframe.utils import (
is_series_like,
is_index_like,
PANDAS_GT_0240,
PANDAS_GT_100,
)
import pytest
@@ -129,7 +130,7 @@ def test_meta_nonempty():
"E": np.int32(1),
"F": pd.Timestamp("2016-01-01"),
"G": pd.date_range("2016-01-01", periods=3, tz="America/New_York"),
"H": pd.Timedelta("1 hours", "ms"),
"H": pd.Timedelta("1 hours"),
"I": np.void(b" "),
"J": pd.Categorical([UNKNOWN_CATEGORIES] * 3),
},
@@ -147,7 +148,7 @@ def test_meta_nonempty():
assert df3["E"][0].dtype == "i4"
assert df3["F"][0] == pd.Timestamp("1970-01-01 00:00:00")
assert df3["G"][0] == pd.Timestamp("1970-01-01 00:00:00", tz="America/New_York")
assert df3["H"][0] == pd.Timedelta("1", "ms")
assert df3["H"][0] == pd.Timedelta("1")
assert df3["I"][0] == "foo"
assert df3["J"][0] == UNKNOWN_CATEGORIES
@@ -361,9 +362,9 @@ def test_check_meta():
"+--------+----------+----------+\n"
"| Column | Found | Expected |\n"
"+--------+----------+----------+\n"
"| a | object | category |\n"
"| c | - | float64 |\n"
"| e | category | - |\n"
"| 'a' | object | category |\n"
"| 'c' | - | float64 |\n"
"| 'e' | category | - |\n"
"+--------+----------+----------+"
)
assert str(err.value) == exp
@@ -441,3 +442,12 @@ def test_apply_and_enforce_message():
with pytest.raises(ValueError, match=re.escape("Missing: ['D']")):
apply_and_enforce(_func=func, _meta=meta)
@pytest.mark.skipif(not PANDAS_GT_100, reason="needs pandas>=1.0")
def test_nonempty_series_sparse():
ser = pd.Series(pd.array([0, 1], dtype="Sparse"))
with pytest.warns(None) as w:
dd.utils._nonempty_series(ser)
assert len(w) == 0
@@ -7,6 +7,7 @@ from ...base import tokenize
from ...utils import derived_from
from ...highlevelgraph import HighLevelGraph
from .._compat import PANDAS_GT_0240
from .. import methods
def getnanos(rule):
@@ -75,8 +76,8 @@ def _resample_bin_and_out_divs(divisions, rule, closed="left", label="left"):
else:
outdivs = tempdivs
newdivs = newdivs.tolist()
outdivs = outdivs.tolist()
newdivs = methods.tolist(newdivs)
outdivs = methods.tolist(outdivs)
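# methods.tolist dispatches on the concrete type, presumably so index-like
# objects from other backends (e.g. cuDF) are handled as well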
# Adjust ends
if newdivs[0] < divisions[0]:
@@ -96,7 +97,7 @@ def _resample_bin_and_out_divs(divisions, rule, closed="left", label="left"):
class Resampler(object):
""" Class for resampling timeseries data.
"""Class for resampling timeseries data.
This class is commonly encountered when using ``obj.resample(...)`` which
returns ``Resampler`` objects.
@@ -128,7 +129,7 @@ class Resampler(object):
self._kwargs = kwargs
def _agg(self, how, meta=None, fill_value=np.nan, how_args=(), how_kwargs={}):
""" Aggregate using one or more operations
"""Aggregate using one or more operations
Parameters
----------
@@ -7,6 +7,10 @@ from dask.dataframe.utils import assert_eq, PANDAS_VERSION
from dask.dataframe._compat import PANDAS_GT_0240
import dask.dataframe as dd
CHECK_FREQ = {}
if dd._compat.PANDAS_GT_110:
CHECK_FREQ["check_freq"] = False
def resample(df, freq, how="mean", **kwargs):
return getattr(df.resample(freq, **kwargs), how)()
@@ -195,7 +199,7 @@ def test_series_resample_non_existent_datetime():
result = ddf.resample("1D").mean()
expected = df.resample("1D").mean()
assert_eq(result, expected)
assert_eq(result, expected, **CHECK_FREQ)