Commit b41c932e authored by Kruyff, D.L.W. (Dylan)
Browse files

Redo lost changes

parent 8d4ead19
...@@ -6,6 +6,7 @@ from . import methods ...@@ -6,6 +6,7 @@ from . import methods
from .utils import is_categorical_dtype, is_scalar, has_known_categories from .utils import is_categorical_dtype, is_scalar, has_known_categories
from ..utils import M from ..utils import M
import sys import sys
from pandas.api.types import is_list_like
############################################################### ###############################################################
# Dummies # Dummies
...@@ -186,7 +187,8 @@ def pivot_table(df, index=None, columns=None, values=None, aggfunc="mean"): ...@@ -186,7 +187,8 @@ def pivot_table(df, index=None, columns=None, values=None, aggfunc="mean"):
""" """
Create a spreadsheet-style pivot table as a DataFrame. Target ``columns`` Create a spreadsheet-style pivot table as a DataFrame. Target ``columns``
must have category dtype to infer result's ``columns``. must have category dtype to infer result's ``columns``.
``index``, ``columns``, ``values`` and ``aggfunc`` must be all scalar. ``index``, ``columns``, and ``aggfunc`` must be all scalar.
``values`` can be scalar or list-like.
Parameters Parameters
---------- ----------
...@@ -195,8 +197,8 @@ def pivot_table(df, index=None, columns=None, values=None, aggfunc="mean"): ...@@ -195,8 +197,8 @@ def pivot_table(df, index=None, columns=None, values=None, aggfunc="mean"):
column to be index column to be index
columns : scalar columns : scalar
column to be columns column to be columns
values : scalar values : scalar or list(scalar)
column to aggregate column(s) to aggregate
aggfunc : {'mean', 'sum', 'count'}, default 'mean' aggfunc : {'mean', 'sum', 'count'}, default 'mean'
Returns Returns
...@@ -220,14 +222,26 @@ def pivot_table(df, index=None, columns=None, values=None, aggfunc="mean"): ...@@ -220,14 +222,26 @@ def pivot_table(df, index=None, columns=None, values=None, aggfunc="mean"):
"`df[columns].cat.as_known()` beforehand to ensure " "`df[columns].cat.as_known()` beforehand to ensure "
"known categories" "known categories"
) )
if not is_scalar(values) or values is None: if not (
raise ValueError("'values' must be the name of an existing column") is_list_like(values)
and all([is_scalar(v) for v in values])
or is_scalar(values)
):
raise ValueError("'values' must refer to an existing column or columns")
if not is_scalar(aggfunc) or aggfunc not in ("mean", "sum", "count"): if not is_scalar(aggfunc) or aggfunc not in ("mean", "sum", "count"):
raise ValueError("aggfunc must be either 'mean', 'sum' or 'count'") raise ValueError("aggfunc must be either 'mean', 'sum' or 'count'")
# _emulate can't work for empty data # _emulate can't work for empty data
# the result must have CategoricalIndex columns # the result must have CategoricalIndex columns
new_columns = pd.CategoricalIndex(df[columns].cat.categories, name=columns)
columns_contents = pd.CategoricalIndex(df[columns].cat.categories, name=columns)
if is_scalar(values):
new_columns = columns_contents
else:
new_columns = pd.MultiIndex.from_product(
(sorted(values), columns_contents), names=[None, columns]
)
meta = pd.DataFrame( meta = pd.DataFrame(
columns=new_columns, dtype=np.float64, index=pd.Index(df._meta[index]) columns=new_columns, dtype=np.float64, index=pd.Index(df._meta[index])
) )
......
...@@ -84,7 +84,7 @@ def set_index( ...@@ -84,7 +84,7 @@ def set_index(
divisions, sizes, mins, maxes = base.compute( divisions, sizes, mins, maxes = base.compute(
divisions, sizes, mins, maxes, optimize_graph=False divisions, sizes, mins, maxes, optimize_graph=False
) )
divisions = divisions.tolist() divisions = methods.tolist(divisions)
empty_dataframe_detected = pd.isnull(divisions).all() empty_dataframe_detected = pd.isnull(divisions).all()
if repartition or empty_dataframe_detected: if repartition or empty_dataframe_detected:
...@@ -124,7 +124,7 @@ def set_index( ...@@ -124,7 +124,7 @@ def set_index(
def remove_nans(divisions): def remove_nans(divisions):
""" Remove nans from divisions """Remove nans from divisions
These sometime pop up when we call min/max on an empty partition These sometime pop up when we call min/max on an empty partition
...@@ -155,7 +155,7 @@ def remove_nans(divisions): ...@@ -155,7 +155,7 @@ def remove_nans(divisions):
def set_partition( def set_partition(
df, index, divisions, max_branch=32, drop=True, shuffle=None, compute=None df, index, divisions, max_branch=32, drop=True, shuffle=None, compute=None
): ):
""" Group DataFrame by index """Group DataFrame by index
Sets a new index and partitions data along that index according to Sets a new index and partitions data along that index according to
divisions. Divisions are often found by computing approximate quantiles. divisions. Divisions are often found by computing approximate quantiles.
...@@ -238,7 +238,7 @@ def set_partition( ...@@ -238,7 +238,7 @@ def set_partition(
column_dtype=df.columns.dtype, column_dtype=df.columns.dtype,
) )
df4.divisions = divisions.tolist() df4.divisions = methods.tolist(divisions)
return df4.map_partitions(M.sort_index) return df4.map_partitions(M.sort_index)
...@@ -252,7 +252,7 @@ def shuffle( ...@@ -252,7 +252,7 @@ def shuffle(
ignore_index=False, ignore_index=False,
compute=None, compute=None,
): ):
""" Group DataFrame by index """Group DataFrame by index
Hash grouping of elements. After this operation all elements that have Hash grouping of elements. After this operation all elements that have
the same index will be in the same partition. Note that this requires the same index will be in the same partition. Note that this requires
...@@ -404,7 +404,7 @@ class maybe_buffered_partd(object): ...@@ -404,7 +404,7 @@ class maybe_buffered_partd(object):
def rearrange_by_column_disk(df, column, npartitions=None, compute=False): def rearrange_by_column_disk(df, column, npartitions=None, compute=False):
""" Shuffle using local disk """Shuffle using local disk
See Also See Also
-------- --------
...@@ -472,8 +472,7 @@ def _noop(x, cleanup_token): ...@@ -472,8 +472,7 @@ def _noop(x, cleanup_token):
def _simple_rearrange_by_column_tasks(df, column, npartitions, ignore_index=False): def _simple_rearrange_by_column_tasks(df, column, npartitions, ignore_index=False):
""" A simplified (single-stage) version of ``rearrange_by_column_tasks``. """A simplified (single-stage) version of ``rearrange_by_column_tasks``."""
"""
token = tokenize(df, column) token = tokenize(df, column)
simple_shuffle_group_token = "simple-shuffle-group-" + token simple_shuffle_group_token = "simple-shuffle-group-" + token
...@@ -534,7 +533,7 @@ def _simple_rearrange_by_column_tasks(df, column, npartitions, ignore_index=Fals ...@@ -534,7 +533,7 @@ def _simple_rearrange_by_column_tasks(df, column, npartitions, ignore_index=Fals
def rearrange_by_column_tasks( def rearrange_by_column_tasks(
df, column, max_branch=32, npartitions=None, ignore_index=False df, column, max_branch=32, npartitions=None, ignore_index=False
): ):
""" Order divisions of DataFrame so that all values within column(s) align """Order divisions of DataFrame so that all values within column(s) align
This enacts a task-based shuffle. It contains most of the tricky logic This enacts a task-based shuffle. It contains most of the tricky logic
around the complex network of tasks. Typically before this function is around the complex network of tasks. Typically before this function is
...@@ -816,7 +815,7 @@ def shuffle_group_get(g_head, i): ...@@ -816,7 +815,7 @@ def shuffle_group_get(g_head, i):
def shuffle_group(df, cols, stage, k, npartitions, ignore_index, nfinal): def shuffle_group(df, cols, stage, k, npartitions, ignore_index, nfinal):
""" Splits dataframe into groups """Splits dataframe into groups
The group is determined by their final partition, and which stage we are in The group is determined by their final partition, and which stage we are in
in the shuffle in the shuffle
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment