Commit b41c932e authored by Kruyff,D.L.W. (Dylan)

Redo lost changes

parent 8d4ead19
@@ -9,8 +9,9 @@ from pandas.io.formats import format as pandas_format
 import dask
 import dask.array as da
-from dask.array.numpy_compat import _numpy_118
+from dask.array.numpy_compat import _numpy_118, _numpy_120
 import dask.dataframe as dd
+from dask.blockwise import fuse_roots
 from dask.dataframe import _compat
 from dask.dataframe._compat import tm, PANDAS_GT_100, PANDAS_GT_110
 from dask.base import compute_as_if_collection
@@ -22,7 +23,6 @@ from dask.dataframe.core import (
     _concat,
     Scalar,
     has_parallel_type,
-    iter_chunks,
     total_mem_usage,
     is_broadcastable,
 )
@@ -37,6 +37,9 @@ dsk = {
 meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8"))
 d = dd.DataFrame(dsk, "x", meta, [0, 5, 9, 9])
 full = d.compute()
+CHECK_FREQ = {}
+if dd._compat.PANDAS_GT_110:
+    CHECK_FREQ["check_freq"] = False
 
 def test_dataframe_doc():
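Note on the new CHECK_FREQ block: pandas 1.1 started comparing index freq in its assert_*_equal helpers and added a check_freq flag to opt out. Building the flag as a dict and splatting it keeps a single assert_eq call site valid on both older and newer pandas. A minimal sketch of the pattern (standalone; mirrors the hunk above):

    import pandas as pd
    import dask.dataframe as dd
    from dask.dataframe.utils import assert_eq

    CHECK_FREQ = {}
    if dd._compat.PANDAS_GT_110:
        CHECK_FREQ["check_freq"] = False  # kwarg only exists on pandas >= 1.1

    s = pd.Series(range(3), index=pd.date_range("2000", periods=3, freq="D"))
    ds = dd.from_pandas(s, npartitions=2)
    assert_eq(ds, s, **CHECK_FREQ)  # expands to no extra kwargs on older pandas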
@@ -222,7 +225,18 @@ def test_index_names():
     assert ddf.index.compute().name == "x"
 
-@pytest.mark.parametrize("npartitions", [1, pytest.param(2, marks=pytest.mark.xfail)])
+@pytest.mark.parametrize(
+    "npartitions",
+    [
+        1,
+        pytest.param(
+            2,
+            marks=pytest.mark.xfail(
+                not dd._compat.PANDAS_GT_110, reason="Fixed upstream."
+            ),
+        ),
+    ],
+)
 def test_timezone_freq(npartitions):
     s_naive = pd.Series(pd.date_range("20130101", periods=10))
     s_aware = pd.Series(pd.date_range("20130101", periods=10, tz="US/Eastern"))
@@ -385,12 +399,48 @@ def test_describe_numeric(method, test_values):
         (None, None, None, ["c", "d", "g"]),  # numeric + bool
         (None, None, None, ["c", "d", "f", "g"]),  # numeric + bool + timedelta
         (None, None, None, ["f", "g"]),  # bool + timedelta
-        ("all", None, None, None),
-        (["number"], None, [0.25, 0.5], None),
-        ([np.timedelta64], None, None, None),
-        (["number", "object"], None, [0.25, 0.75], None),
-        (None, ["number", "object"], None, None),
-        (["object", "datetime", "bool"], None, None, None),
+        pytest.param(
+            "all",
+            None,
+            None,
+            None,
+            marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
+        ),
+        pytest.param(
+            ["number"],
+            None,
+            [0.25, 0.5],
+            None,
+            marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
+        ),
+        pytest.param(
+            [np.timedelta64],
+            None,
+            None,
+            None,
+            marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
+        ),
+        pytest.param(
+            ["number", "object"],
+            None,
+            [0.25, 0.75],
+            None,
+            marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
+        ),
+        pytest.param(
+            None,
+            ["number", "object"],
+            None,
+            None,
+            marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
+        ),
+        pytest.param(
+            ["object", "datetime", "bool"],
+            None,
+            None,
+            None,
+            marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
+        ),
     ],
 )
 def test_describe(include, exclude, percentiles, subset):
@@ -1047,7 +1097,7 @@ def test_value_counts_with_dropna():
     result = ddf.x.value_counts(dropna=False)
     expected = df.x.value_counts(dropna=False)
     assert_eq(result, expected)
-    result2 = ddf.x.value_counts(split_every=2)
+    result2 = ddf.x.value_counts(split_every=2, dropna=False)
     assert_eq(result2, expected)
     assert result._name != result2._name
...@@ -1095,6 +1145,14 @@ def test_isin(): ...@@ -1095,6 +1145,14 @@ def test_isin():
d.isin(obj) d.isin(obj)
def test_contains_frame():
df = dd.from_pandas(pd.DataFrame({"A": [1, 2], 0: [3, 4]}), 1)
assert "A" in df
assert 0 in df
assert "B" not in df
assert 1 not in df
def test_len(): def test_len():
assert len(d) == len(full) assert len(d) == len(full)
assert len(d.a) == len(full.a) assert len(d.a) == len(full.a)
@@ -1836,7 +1894,7 @@ def test_repartition_npartitions(use_index, n, k, dtype, transform):
     )
     df = transform(df)
     a = dd.from_pandas(df, npartitions=n, sort=use_index)
-    b = a.repartition(npartitions=k)
+    b = a.repartition(k)
     assert_eq(a, b)
     assert b.npartitions == k
     parts = dask.get(b.dask, b.__dask_keys__())
@@ -1861,19 +1919,11 @@ def test_repartition_partition_size(use_index, n, partition_size, transform):
     assert all(map(len, parts))
 
-def test_iter_chunks():
-    sizes = [14, 8, 5, 9, 7, 9, 1, 19, 8, 19]
-    assert list(iter_chunks(sizes, 19)) == [
-        [14],
-        [8, 5],
-        [9, 7],
-        [9, 1],
-        [19],
-        [8],
-        [19],
-    ]
-    assert list(iter_chunks(sizes, 28)) == [[14, 8, 5], [9, 7, 9, 1], [19, 8], [19]]
-    assert list(iter_chunks(sizes, 67)) == [[14, 8, 5, 9, 7, 9, 1], [19, 8, 19]]
+def test_repartition_partition_size_arg():
+    df = pd.DataFrame({"x": range(10)})
+    a = dd.from_pandas(df, npartitions=2)
+    b = a.repartition("1 MiB")
+    assert b.npartitions == 1
 
 def test_repartition_npartitions_same_limits():
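The repartition changes above exercise a new single positional argument: an int is interpreted as npartitions and a size string such as "1 MiB" as partition_size. A hedged sketch of that kind of dispatch (illustrative only, not dask's actual implementation):

    def repartition_dispatch(ddf, arg):
        # Route one positional argument by type; the real method also accepts
        # explicit keyword arguments and lists of divisions.
        if isinstance(arg, int):
            return ddf.repartition(npartitions=arg)
        if isinstance(arg, str):
            return ddf.repartition(partition_size=arg)  # e.g. "1 MiB"
        if isinstance(arg, (list, tuple)):
            return ddf.repartition(divisions=list(arg))
        raise TypeError(f"cannot interpret {arg!r} as a repartition target")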
@@ -2522,15 +2572,17 @@ def test_to_timestamp():
     index = pd.period_range(freq="A", start="1/1/2001", end="12/1/2004")
     df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [10, 20, 30, 40]}, index=index)
     ddf = dd.from_pandas(df, npartitions=3)
-    assert_eq(ddf.to_timestamp(), df.to_timestamp())
+    assert_eq(ddf.to_timestamp(), df.to_timestamp(), **CHECK_FREQ)
     assert_eq(
         ddf.to_timestamp(freq="M", how="s").compute(),
         df.to_timestamp(freq="M", how="s"),
+        **CHECK_FREQ
     )
     assert_eq(ddf.x.to_timestamp(), df.x.to_timestamp())
     assert_eq(
         ddf.x.to_timestamp(freq="M", how="s").compute(),
         df.x.to_timestamp(freq="M", how="s"),
+        **CHECK_FREQ
     )
@@ -3004,6 +3056,22 @@ def test_dataframe_itertuples():
         assert a == b
 
+@pytest.mark.parametrize(
+    "columns",
+    [
+        ("x", "y"),
+        ("x", "x"),
+        pd.MultiIndex.from_tuples([("x", 1), ("x", 2)], names=("letter", "number")),
+    ],
+)
+def test_dataframe_items(columns):
+    df = pd.DataFrame([[1, 10], [2, 20], [3, 30], [4, 40]], columns=columns)
+    ddf = dd.from_pandas(df, npartitions=2)
+    for (a, b) in zip(df.items(), ddf.items()):
+        assert a[0] == b[0]  # column name
+        assert_eq(a[1], b[1].compute())  # column values
+
 def test_dataframe_itertuples_with_index_false():
     df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [10, 20, 30, 40]})
     ddf = dd.from_pandas(df, npartitions=2)
@@ -3968,6 +4036,7 @@ def test_map_partition_array(func):
     assert x.chunks[0] == (np.nan, np.nan)
 
+@pytest.mark.xfail(_numpy_120, reason="sparse-383")
 def test_map_partition_sparse():
     sparse = pytest.importorskip("sparse")
     # Avoid searchsorted failure.
@@ -4308,3 +4377,16 @@ def test_dataframe_groupby_agg_empty_partitions():
     df = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6, 7, 8]})
     ddf = dd.from_pandas(df, npartitions=4)
     assert_eq(ddf[ddf.x < 5].x.cumsum(), df[df.x < 5].x.cumsum())
+
+def test_fuse_roots():
+    pdf1 = pd.DataFrame(
+        {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [3, 5, 2, 5, 7, 2, 4, 2, 4]}
+    )
+    ddf1 = dd.from_pandas(pdf1, 2)
+    pdf2 = pd.DataFrame({"a": [True, False, True] * 3, "b": [False, False, True] * 3})
+    ddf2 = dd.from_pandas(pdf2, 2)
+
+    res = ddf1.where(ddf2)
+    hlg = fuse_roots(res.__dask_graph__(), keys=res.__dask_keys__())
+    hlg.validate()
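For intuition, fuse_roots folds trivial root layers (the two from_pandas inputs here) into the blockwise where layer, shrinking the HighLevelGraph without changing its outputs. A hedged extension of the test above (reuses res from test_fuse_roots; the equality assertion is an assumption about what fusion must preserve, not something the test itself checks):

    import dask

    g = res.__dask_graph__()
    fused = fuse_roots(g, keys=res.__dask_keys__())
    fused.validate()

    # Materializing the original and fused graphs for the same output keys
    # should yield identical partitions.
    before = dask.get(dict(g), res.__dask_keys__())
    after = dask.get(dict(fused), res.__dask_keys__())
    assert all(a.equals(b) for a, b in zip(before, after))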
@@ -41,7 +41,11 @@ def test_reduction():
     dser = dd.from_pandas(ser, 2)
     assert_eq(ser.mean(skipna=False), dser.mean(skipna=False))
-    assert_eq(ser.to_frame().mean(skipna=False), dser.to_frame().mean(skipna=False))
+    # It's unclear whether this can be reliably provided, at least with the current
+    # implementation, which uses pandas.DataFrame.sum(), returning a (homogeneous)
+    # series which has potentially cast values.
+    # assert_eq(ser.to_frame().mean(skipna=False), dser.to_frame().mean(skipna=False))
 
 def test_scalar():
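The casting concern in that comment is easy to reproduce in plain pandas (a constructed illustration, not from the test suite):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"i": [1, 2, 3], "f": [0.5, np.nan, 1.5]})
    # DataFrame.sum returns a single homogeneous float64 Series, so the int
    # column is cast before any mean derived from sums/counts is computed:
    print(df.sum(skipna=False))   # i: 6.0, f: NaN -- both float64
    print(df.mean(skipna=False))  # i: 2.0, f: NaN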
......
@@ -19,6 +19,9 @@ dsk = {
 meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8"))
 d = dd.DataFrame(dsk, "x", meta, [0, 5, 9, 9])
 full = d.compute()
+CHECK_FREQ = {}
+if dd._compat.PANDAS_GT_110:
+    CHECK_FREQ["check_freq"] = False
 
 def test_loc():
@@ -369,24 +372,35 @@ def test_loc_timestamp_str():
     assert_eq(df.loc["2011-01-02"], ddf.loc["2011-01-02"])
     assert_eq(df.loc["2011-01-02":"2011-01-10"], ddf.loc["2011-01-02":"2011-01-10"])
     # same reso, dask result is always DataFrame
-    assert_eq(df.loc["2011-01-02 10:00"].to_frame().T, ddf.loc["2011-01-02 10:00"])
+    assert_eq(
+        df.loc["2011-01-02 10:00"].to_frame().T,
+        ddf.loc["2011-01-02 10:00"],
+        **CHECK_FREQ
+    )
 
     # series
-    assert_eq(df.A.loc["2011-01-02"], ddf.A.loc["2011-01-02"])
-    assert_eq(df.A.loc["2011-01-02":"2011-01-10"], ddf.A.loc["2011-01-02":"2011-01-10"])
+    assert_eq(df.A.loc["2011-01-02"], ddf.A.loc["2011-01-02"], **CHECK_FREQ)
+    assert_eq(
+        df.A.loc["2011-01-02":"2011-01-10"],
+        ddf.A.loc["2011-01-02":"2011-01-10"],
+        **CHECK_FREQ
+    )
 
     # slice with timestamp (dask result must be DataFrame)
     assert_eq(
         df.loc[pd.Timestamp("2011-01-02")].to_frame().T,
         ddf.loc[pd.Timestamp("2011-01-02")],
+        **CHECK_FREQ
     )
     assert_eq(
         df.loc[pd.Timestamp("2011-01-02") : pd.Timestamp("2011-01-10")],
         ddf.loc[pd.Timestamp("2011-01-02") : pd.Timestamp("2011-01-10")],
+        **CHECK_FREQ
     )
     assert_eq(
         df.loc[pd.Timestamp("2011-01-02 10:00")].to_frame().T,
         ddf.loc[pd.Timestamp("2011-01-02 10:00")],
+        **CHECK_FREQ
     )
 
     df = pd.DataFrame(
......
@@ -170,19 +170,21 @@ def test_get_dummies_errors():
         dd.get_dummies(ddf.x)
 
+@pytest.mark.parametrize("values", ["B", ["B"], ["B", "D"]])
 @pytest.mark.parametrize("aggfunc", ["mean", "sum", "count"])
-def test_pivot_table(aggfunc):
+def test_pivot_table(values, aggfunc):
     df = pd.DataFrame(
         {
             "A": np.random.choice(list("XYZ"), size=100),
             "B": np.random.randn(100),
             "C": pd.Categorical(np.random.choice(list("abc"), size=100)),
+            "D": np.random.randn(100),
         }
     )
     ddf = dd.from_pandas(df, 5)
-    res = dd.pivot_table(ddf, index="A", columns="C", values="B", aggfunc=aggfunc)
-    exp = pd.pivot_table(df, index="A", columns="C", values="B", aggfunc=aggfunc)
+    res = dd.pivot_table(ddf, index="A", columns="C", values=values, aggfunc=aggfunc)
+    exp = pd.pivot_table(df, index="A", columns="C", values=values, aggfunc=aggfunc)
     if aggfunc == "count":
         # dask result cannot be int64 dtype depending on divisions because of NaN
         exp = exp.astype(np.float64)
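When values is a list, pandas returns a frame whose columns are a MultiIndex of (value, pivoted-column) pairs, which is what the new values parametrization covers. A small illustration with constructed data (not from the test):

    import pandas as pd

    df = pd.DataFrame(
        {"A": ["x", "x", "y"], "B": [1.0, 2.0, 3.0], "C": ["a", "b", "a"], "D": [4.0, 5.0, 6.0]}
    )
    out = pd.pivot_table(df, index="A", columns="C", values=["B", "D"], aggfunc="sum")
    print(out.columns.tolist())  # [('B', 'a'), ('B', 'b'), ('D', 'a'), ('D', 'b')]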
@@ -190,8 +192,8 @@ def test_pivot_table(aggfunc):
     assert_eq(res, exp)
 
     # method
-    res = ddf.pivot_table(index="A", columns="C", values="B", aggfunc=aggfunc)
-    exp = df.pivot_table(index="A", columns="C", values="B", aggfunc=aggfunc)
+    res = ddf.pivot_table(index="A", columns="C", values=values, aggfunc=aggfunc)
+    exp = df.pivot_table(index="A", columns="C", values=values, aggfunc=aggfunc)
     if aggfunc == "count":
         # dask result cannot be int64 dtype depending on divisions because of NaN
         exp = exp.astype(np.float64)
@@ -249,9 +251,9 @@ def test_pivot_table_errors():
     with pytest.raises(ValueError) as err:
         dd.pivot_table(ddf, index="A", columns=["C"], values="B")
     assert msg in str(err.value)
 
-    msg = "'values' must be the name of an existing column"
+    msg = "'values' must refer to an existing column or columns"
     with pytest.raises(ValueError) as err:
-        dd.pivot_table(ddf, index="A", columns="C", values=["B"])
+        dd.pivot_table(ddf, index="A", columns="C", values=[["B"]])
     assert msg in str(err.value)
 
     msg = "aggfunc must be either 'mean', 'sum' or 'count'"
......
@@ -4,6 +4,7 @@ import pandas as pd
 import pytest
 import numpy as np
 
+import dask.array as da
 import dask.dataframe as dd
 from dask.dataframe.utils import assert_eq, PANDAS_VERSION
@@ -139,6 +140,10 @@ rolling_method_args_check_less_precise = [
 @pytest.mark.parametrize("window", [1, 2, 4, 5])
 @pytest.mark.parametrize("center", [True, False])
 def test_rolling_methods(method, args, window, center, check_less_precise):
+    if dd._compat.PANDAS_GT_110:
+        check_less_precise = {}
+    else:
+        check_less_precise = {"check_less_precise": check_less_precise}
     # DataFrame
     prolling = df.rolling(window, center=center)
     drolling = ddf.rolling(window, center=center)
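Background for this shim: pandas 1.1 deprecated check_less_precise on its assert_*_equal helpers in favor of explicit rtol/atol, so the tests rebuild the parameter as a kwargs dict and splat it into assert_eq. A sketch of the equivalence as a hypothetical helper (the 0.5e-3 tolerances mirror the time-rolling hunk further down, matching the historical "compare ~3 decimal places" meaning of check_less_precise=True):

    def tolerance_kwargs(check_less_precise, pandas_ge_110):
        # Hypothetical helper mirroring the inline shims in these tests.
        if pandas_ge_110:
            return {"rtol": 0.5e-3, "atol": 0.5e-3} if check_less_precise else {}
        return {"check_less_precise": check_less_precise}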
@@ -150,7 +155,7 @@ def test_rolling_methods(method, args, window, center, check_less_precise):
     assert_eq(
         getattr(prolling, method)(*args, **kwargs),
         getattr(drolling, method)(*args, **kwargs),
-        check_less_precise=check_less_precise,
+        **check_less_precise,
     )
 
     # Series
@@ -159,7 +164,7 @@ def test_rolling_methods(method, args, window, center, check_less_precise):
     assert_eq(
         getattr(prolling, method)(*args, **kwargs),
         getattr(drolling, method)(*args, **kwargs),
-        check_less_precise=check_less_precise,
+        **check_less_precise,
     )
@@ -264,6 +269,14 @@ def test_time_rolling_constructor():
 )
 @pytest.mark.parametrize("window", ["1S", "2S", "3S", pd.offsets.Second(5)])
 def test_time_rolling_methods(method, args, window, check_less_precise):
+    if dd._compat.PANDAS_GT_110:
+        if check_less_precise:
+            check_less_precise = {"atol": 0.5e-3, "rtol": 0.5e-3}
+        else:
+            check_less_precise = {}
+    else:
+        check_less_precise = {"check_less_precise": check_less_precise}
     # DataFrame
     if method == "apply":
         kwargs = {"raw": False}
@@ -274,7 +287,7 @@ def test_time_rolling_methods(method, args, window, check_less_precise):
     assert_eq(
         getattr(prolling, method)(*args, **kwargs),
         getattr(drolling, method)(*args, **kwargs),
-        check_less_precise=check_less_precise,
+        **check_less_precise,
     )
# Series # Series
...@@ -283,7 +296,7 @@ def test_time_rolling_methods(method, args, window, check_less_precise): ...@@ -283,7 +296,7 @@ def test_time_rolling_methods(method, args, window, check_less_precise):
assert_eq( assert_eq(
getattr(prolling, method)(*args, **kwargs), getattr(prolling, method)(*args, **kwargs),
getattr(drolling, method)(*args, **kwargs), getattr(drolling, method)(*args, **kwargs),
check_less_precise=check_less_precise, **check_less_precise,
) )