Commit b41c932e authored by Kruyff,D.L.W. (Dylan)'s avatar Kruyff,D.L.W. (Dylan)
Browse files

Redo lost changes

parent 8d4ead19
......@@ -6,6 +6,7 @@ from . import methods
from .utils import is_categorical_dtype, is_scalar, has_known_categories
from ..utils import M
import sys
from pandas.api.types import is_list_like
###############################################################
# Dummies
......@@ -186,7 +187,8 @@ def pivot_table(df, index=None, columns=None, values=None, aggfunc="mean"):
"""
Create a spreadsheet-style pivot table as a DataFrame. Target ``columns``
must have category dtype to infer result's ``columns``.
``index``, ``columns``, ``values`` and ``aggfunc`` must be all scalar.
``index``, ``columns``, and ``aggfunc`` must be all scalar.
``values`` can be scalar or list-like.
Parameters
----------
......@@ -195,8 +197,8 @@ def pivot_table(df, index=None, columns=None, values=None, aggfunc="mean"):
column to be index
columns : scalar
column to be columns
values : scalar
column to aggregate
values : scalar or list(scalar)
column(s) to aggregate
aggfunc : {'mean', 'sum', 'count'}, default 'mean'
Returns
......@@ -220,14 +222,26 @@ def pivot_table(df, index=None, columns=None, values=None, aggfunc="mean"):
"`df[columns].cat.as_known()` beforehand to ensure "
"known categories"
)
if not is_scalar(values) or values is None:
raise ValueError("'values' must be the name of an existing column")
if not (
is_list_like(values)
and all([is_scalar(v) for v in values])
or is_scalar(values)
):
raise ValueError("'values' must refer to an existing column or columns")
if not is_scalar(aggfunc) or aggfunc not in ("mean", "sum", "count"):
raise ValueError("aggfunc must be either 'mean', 'sum' or 'count'")
# _emulate can't work for empty data
# the result must have CategoricalIndex columns
new_columns = pd.CategoricalIndex(df[columns].cat.categories, name=columns)
columns_contents = pd.CategoricalIndex(df[columns].cat.categories, name=columns)
if is_scalar(values):
new_columns = columns_contents
else:
new_columns = pd.MultiIndex.from_product(
(sorted(values), columns_contents), names=[None, columns]
)
meta = pd.DataFrame(
columns=new_columns, dtype=np.float64, index=pd.Index(df._meta[index])
)
......
......@@ -84,7 +84,7 @@ def set_index(
divisions, sizes, mins, maxes = base.compute(
divisions, sizes, mins, maxes, optimize_graph=False
)
divisions = divisions.tolist()
divisions = methods.tolist(divisions)
empty_dataframe_detected = pd.isnull(divisions).all()
if repartition or empty_dataframe_detected:
......@@ -124,7 +124,7 @@ def set_index(
def remove_nans(divisions):
""" Remove nans from divisions
"""Remove nans from divisions
These sometime pop up when we call min/max on an empty partition
......@@ -155,7 +155,7 @@ def remove_nans(divisions):
def set_partition(
df, index, divisions, max_branch=32, drop=True, shuffle=None, compute=None
):
""" Group DataFrame by index
"""Group DataFrame by index
Sets a new index and partitions data along that index according to
divisions. Divisions are often found by computing approximate quantiles.
......@@ -238,7 +238,7 @@ def set_partition(
column_dtype=df.columns.dtype,
)
df4.divisions = divisions.tolist()
df4.divisions = methods.tolist(divisions)
return df4.map_partitions(M.sort_index)
......@@ -252,7 +252,7 @@ def shuffle(
ignore_index=False,
compute=None,
):
""" Group DataFrame by index
"""Group DataFrame by index
Hash grouping of elements. After this operation all elements that have
the same index will be in the same partition. Note that this requires
......@@ -404,7 +404,7 @@ class maybe_buffered_partd(object):
def rearrange_by_column_disk(df, column, npartitions=None, compute=False):
""" Shuffle using local disk
"""Shuffle using local disk
See Also
--------
......@@ -472,8 +472,7 @@ def _noop(x, cleanup_token):
def _simple_rearrange_by_column_tasks(df, column, npartitions, ignore_index=False):
""" A simplified (single-stage) version of ``rearrange_by_column_tasks``.
"""
"""A simplified (single-stage) version of ``rearrange_by_column_tasks``."""
token = tokenize(df, column)
simple_shuffle_group_token = "simple-shuffle-group-" + token
......@@ -534,7 +533,7 @@ def _simple_rearrange_by_column_tasks(df, column, npartitions, ignore_index=Fals
def rearrange_by_column_tasks(
df, column, max_branch=32, npartitions=None, ignore_index=False
):
""" Order divisions of DataFrame so that all values within column(s) align
"""Order divisions of DataFrame so that all values within column(s) align
This enacts a task-based shuffle. It contains most of the tricky logic
around the complex network of tasks. Typically before this function is
......@@ -816,7 +815,7 @@ def shuffle_group_get(g_head, i):
def shuffle_group(df, cols, stage, k, npartitions, ignore_index, nfinal):
""" Splits dataframe into groups
"""Splits dataframe into groups
The group is determined by their final partition, and which stage we are in
in the shuffle
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment