Commit b41c932e authored by Kruyff,D.L.W. (Dylan)

Redo lost changes

parent 8d4ead19
......@@ -94,7 +94,7 @@ class Accessor(object):
class DatetimeAccessor(Accessor):
""" Accessor object for datetimelike properties of the Series values.
"""Accessor object for datetimelike properties of the Series values.
Examples
--------
......@@ -106,7 +106,7 @@ class DatetimeAccessor(Accessor):
class StringAccessor(Accessor):
""" Accessor object for string properties of the Series values.
"""Accessor object for string properties of the Series values.
Examples
--------
......
......@@ -16,7 +16,7 @@ from ..utils import Dispatch
def _categorize_block(df, categories, index):
""" Categorize a dataframe with given categories
"""Categorize a dataframe with given categories
df: DataFrame
categories: dict mapping column name to iterable of categories
......
......@@ -40,6 +40,7 @@ from ..utils import (
OperatorMethodMixin,
is_arraylike,
typename,
iter_chunks,
)
from ..array.core import Array, normalize_arg
from ..array.utils import zeros_like_safe
......@@ -133,7 +134,7 @@ class Scalar(DaskMethodsMixin, OperatorMethodMixin):
return self._name
def __dask_layers__(self):
return (self.key,)
return (self._name,)
__dask_optimize__ = globalmethod(
optimize, key="dataframe_optimize", falsey=dont_optimize
......@@ -268,7 +269,7 @@ def _scalar_binary(op, self, other, inv=False):
class _Frame(DaskMethodsMixin, OperatorMethodMixin):
""" Superclass for DataFrame and Series
"""Superclass for DataFrame and Series
Parameters
----------
......@@ -359,7 +360,7 @@ class _Frame(DaskMethodsMixin, OperatorMethodMixin):
self.dask, self._name, self._meta, self.divisions = state
def copy(self):
""" Make a copy of the dataframe
"""Make a copy of the dataframe
This is strictly a shallow copy of the underlying computational graph.
It does not affect the underlying data
......@@ -562,7 +563,7 @@ Dask Name: {name}, {task} tasks"""
@insert_meta_param_description(pad=12)
def map_partitions(self, func, *args, **kwargs):
""" Apply Python function on each DataFrame partition.
"""Apply Python function on each DataFrame partition.
Note that the index and divisions are assumed to remain unchanged.
......@@ -753,7 +754,7 @@ Dask Name: {name}, {task} tasks"""
return map_overlap(func, self, before, after, *args, **kwargs)
def memory_usage_per_partition(self, index=True, deep=False):
""" Return the memory usage of each partition
"""Return the memory usage of each partition
Parameters
----------
......@@ -936,7 +937,7 @@ Dask Name: {name}, {task} tasks"""
return func(self, *args, **kwargs)
def random_split(self, frac, random_state=None, shuffle=False):
""" Pseudorandomly split dataframe into different pieces row-wise
"""Pseudorandomly split dataframe into different pieces row-wise
Parameters
----------
......@@ -988,7 +989,7 @@ Dask Name: {name}, {task} tasks"""
return out
def head(self, n=5, npartitions=1, compute=True):
""" First n rows of the dataset
"""First n rows of the dataset
Parameters
----------
......@@ -1039,7 +1040,7 @@ Dask Name: {name}, {task} tasks"""
return result
def tail(self, n=5, compute=True):
""" Last n rows of the dataset
"""Last n rows of the dataset
Caveat, this only checks the last n rows of the last partition.
"""
......@@ -1055,7 +1056,7 @@ Dask Name: {name}, {task} tasks"""
@property
def loc(self):
""" Purely label-location based indexer for selection by label.
"""Purely label-location based indexer for selection by label.
>>> df.loc["b"] # doctest: +SKIP
>>> df.loc["b":"d"] # doctest: +SKIP
......@@ -1084,7 +1085,7 @@ Dask Name: {name}, {task} tasks"""
@property
def partitions(self):
""" Slice dataframe by partitions
"""Slice dataframe by partitions
This allows partitionwise slicing of a Dask Dataframe. You can perform normal
Numpy-style slicing but now rather than slice elements of the array you
......@@ -1113,13 +1114,15 @@ Dask Name: {name}, {task} tasks"""
freq=None,
force=False,
):
""" Repartition dataframe along new divisions
"""Repartition dataframe along new divisions
Parameters
----------
divisions : list, optional
List of partitions to be used. Only used if npartitions and
partition_size aren't specified.
For convenience, if given an integer this will defer to npartitions,
and if given a string it will defer to partition_size (see below).
npartitions : int, optional
Number of partitions of output. Only used if partition_size
isn't specified.
......@@ -1153,6 +1156,12 @@ Dask Name: {name}, {task} tasks"""
>>> df = df.repartition(divisions=[0, 5, 10, 20]) # doctest: +SKIP
>>> df = df.repartition(freq='7d') # doctest: +SKIP
"""
if isinstance(divisions, int):
npartitions = divisions
divisions = None
if isinstance(divisions, str):
partition_size = divisions
divisions = None
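# A minimal usage sketch (assumed call sites, not part of this diff) of the
# convenience shortcut above; the following pairs should behave identically:
#   df.repartition(10)       # same as df.repartition(npartitions=10)
#   df.repartition("100MB")  # same as df.repartition(partition_size="100MB")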
if (
sum(
[
......@@ -1187,7 +1196,7 @@ Dask Name: {name}, {task} tasks"""
ignore_index=False,
compute=None,
):
""" Rearrange DataFrame into new partitions
"""Rearrange DataFrame into new partitions
Uses hashing of `on` to map rows to output partitions. After this
operation, rows with the same value of `on` will be in the same
......@@ -1307,7 +1316,7 @@ Dask Name: {name}, {task} tasks"""
return self.fillna(method="bfill", limit=limit, axis=axis)
def sample(self, n=None, frac=None, replace=False, random_state=None):
""" Random sample of items
"""Random sample of items
Parameters
----------
......@@ -2024,7 +2033,7 @@ Dask Name: {name}, {task} tasks"""
return result
def quantile(self, q=0.5, axis=0, method="default"):
""" Approximate row-wise and precise column-wise quantiles of DataFrame
"""Approximate row-wise and precise column-wise quantiles of DataFrame
Parameters
----------
......@@ -2487,7 +2496,7 @@ Dask Name: {name}, {task} tasks"""
else:
is_anchored = offset.isAnchored()
include_right = is_anchored or not hasattr(offset, "_inc")
include_right = is_anchored or not hasattr(offset, "delta")
if end == self.npartitions - 1:
divs = self.divisions
......@@ -2574,7 +2583,7 @@ Dask Name: {name}, {task} tasks"""
@property
def values(self):
""" Return a dask.array of the values of this dataframe
"""Return a dask.array of the values of this dataframe
Warning: This creates a dask.array without precise shape information.
Operations that depend on shape information, like slicing or reshaping,
......@@ -2641,7 +2650,7 @@ def _raise_if_object_series(x, funcname):
class Series(_Frame):
""" Parallel Pandas Series
"""Parallel Pandas Series
Do not use this class directly. Instead use functions like
``dd.read_csv``, ``dd.read_parquet``, or ``dd.from_pandas``.
......@@ -2827,7 +2836,7 @@ Dask Name: {name}, {task} tasks""".format(
"isn't monotonic_increasing"
)
raise ValueError(msg)
res.divisions = tuple(new.tolist())
res.divisions = tuple(methods.tolist(new))
else:
res = res.clear_divisions()
if inplace:
......@@ -2849,7 +2858,7 @@ Dask Name: {name}, {task} tasks""".format(
return df
def quantile(self, q=0.5, method="default"):
""" Approximate quantiles of Series
"""Approximate quantiles of Series
Parameters
----------
......@@ -2863,8 +2872,7 @@ Dask Name: {name}, {task} tasks""".format(
return quantile(self, q, method=method)
def _repartition_quantiles(self, npartitions, upsample=1.0):
""" Approximate quantiles of Series used for repartitioning
"""
"""Approximate quantiles of Series used for repartitioning"""
from .partitionquantiles import partition_quantiles
return partition_quantiles(self, npartitions, upsample=upsample)
......@@ -3133,7 +3141,7 @@ Dask Name: {name}, {task} tasks""".format(
@insert_meta_param_description(pad=12)
def apply(self, func, convert_dtype=True, meta=no_default, args=(), **kwds):
""" Parallel version of pandas.Series.apply
"""Parallel version of pandas.Series.apply
Parameters
----------
......@@ -3313,7 +3321,7 @@ class Index(Series):
return pd.Index(array, name=self.name)
def head(self, n=5, compute=True):
""" First n items of the Index.
"""First n items of the Index.
Caveat, this only checks the first partition.
"""
......@@ -3493,6 +3501,9 @@ class DataFrame(_Frame):
else:
return len(s)
def __contains__(self, key):
return key in self._meta
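# Hedged illustration (not part of this diff): delegating to the pandas meta
# gives pandas' DataFrame semantics, so membership is tested against the
# column labels, not the data values:
#   "col2" in ddf   # True if "col2" is a column of ddf
#   5 in ddf        # False unless 5 is itself a column label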
@property
def empty(self):
raise NotImplementedError(
......@@ -3597,7 +3608,7 @@ class DataFrame(_Frame):
return iter(self._meta)
def _ipython_key_completions_(self):
return self.columns.tolist()
return methods.tolist(self.columns)
@property
def ndim(self):
......@@ -3836,7 +3847,7 @@ class DataFrame(_Frame):
return self.map_partitions(M.rename, None, columns=columns)
def query(self, expr, **kwargs):
""" Filter dataframe with complex expression
"""Filter dataframe with complex expression
Blocked version of pd.DataFrame.query
......@@ -4106,7 +4117,7 @@ class DataFrame(_Frame):
left_index=on is None,
right_index=True,
left_on=on,
suffixes=[lsuffix, rsuffix],
suffixes=(lsuffix, rsuffix),
npartitions=npartitions,
shuffle=shuffle,
)
......@@ -4139,6 +4150,11 @@ class DataFrame(_Frame):
for row in df.itertuples(index=index, name=name):
yield row
@derived_from(pd.DataFrame)
def items(self):
for col_idx, label in enumerate(self.columns):
yield label, self.iloc[:, col_idx]
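# Hypothetical usage of the new items() iterator (each column label paired with
# the column as a Series), mirroring pandas.DataFrame.items; not part of this diff:
#   for label, col in ddf.items():
#       print(label, col.dtype)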
@classmethod
def _bind_operator_method(cls, name, op, original=pd.DataFrame):
""" bind operator method like DataFrame.add to this class """
......@@ -4213,7 +4229,7 @@ class DataFrame(_Frame):
meta=no_default,
**kwds,
):
""" Parallel version of pandas.DataFrame.apply
"""Parallel version of pandas.DataFrame.apply
This mimics the pandas version except for the following:
......@@ -4687,7 +4703,7 @@ def is_broadcastable(dfs, s):
def elemwise(op, *args, **kwargs):
""" Elementwise operation for Dask dataframes
"""Elementwise operation for Dask dataframes
Parameters
----------
......@@ -4747,7 +4763,7 @@ def elemwise(op, *args, **kwargs):
**kwargs,
)
if isinstance(divisions, pd.Index):
divisions = divisions.tolist()
divisions = methods.tolist(divisions)
except Exception:
pass
else:
......@@ -4790,7 +4806,7 @@ def elemwise(op, *args, **kwargs):
def handle_out(out, result):
""" Handle out parameters
"""Handle out parameters
If out is a dask.DataFrame, dask.Series or dask.Scalar then
this overwrites the contents of it with the result
......@@ -5134,7 +5150,7 @@ def map_partitions(
transform_divisions=True,
**kwargs,
):
""" Apply Python function on each DataFrame partition.
"""Apply Python function on each DataFrame partition.
Parameters
----------
......@@ -5238,7 +5254,7 @@ def map_partitions(
*[pd.Index(a.divisions) if a is dfs[0] else a for a in args], **kwargs
)
if isinstance(divisions, pd.Index):
divisions = divisions.tolist()
divisions = methods.tolist(divisions)
except Exception:
pass
else:
......@@ -5530,8 +5546,7 @@ def cov_corr(df, min_periods=None, corr=False, scalar=False, split_every=False):
def cov_corr_chunk(df, corr=False):
"""Chunk part of a covariance or correlation computation
"""
"""Chunk part of a covariance or correlation computation"""
shape = (df.shape[1], df.shape[1])
df = df.astype("float64", copy=False)
sums = zeros_like_safe(df.values, shape=shape)
......@@ -5619,7 +5634,7 @@ def cov_corr_agg(data, cols, min_periods=2, corr=False, scalar=False):
def pd_split(df, p, random_state=None, shuffle=False):
""" Split DataFrame into multiple pieces pseudorandomly
"""Split DataFrame into multiple pieces pseudorandomly
>>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
... 'b': [2, 3, 4, 5, 6, 7]})
......@@ -5700,7 +5715,7 @@ def check_divisions(divisions):
def repartition_divisions(a, b, name, out1, out2, force=False):
""" dask graph to repartition dataframe by new divisions
"""dask graph to repartition dataframe by new divisions
Parameters
----------
......@@ -5857,7 +5872,9 @@ def repartition_freq(df, freq=None):
start = df.divisions[0].ceil(freq)
except ValueError:
start = df.divisions[0]
divisions = pd.date_range(start=start, end=df.divisions[-1], freq=freq).tolist()
divisions = methods.tolist(
pd.date_range(start=start, end=df.divisions[-1], freq=freq)
)
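# Illustrative sketch (assumed inputs, not part of this diff): for divisions
# spanning 2020-01-01 through 2020-01-22 with freq="7d", pd.date_range yields
# ['2020-01-01', '2020-01-08', '2020-01-15', '2020-01-22'], which then become
# the new divisions after the boundary handling below.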
if not len(divisions):
divisions = [df.divisions[0], df.divisions[-1]]
else:
......@@ -5905,34 +5922,6 @@ def total_mem_usage(df, index=True, deep=False):
return mem_usage
def iter_chunks(sizes, max_size):
"""Split sizes into chunks of total max_size each
Parameters
----------
sizes : iterable of numbers
The sizes to be chunked
max_size : number
Maximum total size per chunk.
It must be greater than or equal to each size in sizes.
"""
chunk, chunk_sum = [], 0
iter_sizes = iter(sizes)
size = next(iter_sizes, None)
while size is not None:
assert size <= max_size
if chunk_sum + size <= max_size:
chunk.append(size)
chunk_sum += size
size = next(iter_sizes, None)
else:
assert chunk
yield chunk
chunk, chunk_sum = [], 0
if chunk:
yield chunk
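# Worked example (traced by hand, not from this diff): packing is greedy and
# left-to-right, so the chunking need not be globally optimal:
#   list(iter_chunks([1, 2, 3, 4, 5], 6))  ->  [[1, 2, 3], [4], [5]]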
def repartition_npartitions(df, npartitions):
""" Repartition dataframe to a smaller number of partitions """
new_name = "repartition-%d-%s" % (npartitions, tokenize(df))
......@@ -5964,8 +5953,8 @@ def repartition_npartitions(df, npartitions):
fp=divisions,
)
if np.issubdtype(original_divisions.dtype, np.datetime64):
divisions = (
pd.Series(divisions).astype(original_divisions.dtype).tolist()
divisions = methods.tolist(
pd.Series(divisions).astype(original_divisions.dtype)
)
elif np.issubdtype(original_divisions.dtype, np.integer):
divisions = divisions.astype(original_divisions.dtype)
......@@ -6003,7 +5992,7 @@ def _repartition_from_boundaries(df, new_partitions_boundaries, new_name):
def _split_partitions(df, nsplits, new_name):
""" Split a Dask dataframe into new partitions
"""Split a Dask dataframe into new partitions
Parameters
----------
......@@ -6040,7 +6029,7 @@ def _split_partitions(df, nsplits, new_name):
def repartition(df, divisions=None, force=False):
""" Repartition dataframe along new divisions
"""Repartition dataframe along new divisions
Dask.DataFrame objects are partitioned along their index. Often when
multiple dataframes interact we need to align these partitionings. The
......@@ -6412,7 +6401,7 @@ def meta_warning(df):
def prefix_reduction(f, ddf, identity, **kwargs):
""" Computes the prefix sums of f on df
"""Computes the prefix sums of f on df
If df has partitions [P1, P2, ..., Pn], then returns the DataFrame with
partitions [f(identity, P1),
......@@ -6474,7 +6463,7 @@ def prefix_reduction(f, ddf, identity, **kwargs):
def suffix_reduction(f, ddf, identity, **kwargs):
""" Computes the suffix sums of f on df
"""Computes the suffix sums of f on df
If df has partitions [P1, P2, ..., Pn], then returns the DataFrame with
partitions [f(P1, f(P2, ...f(Pn, identity)...)),
......
......@@ -64,8 +64,7 @@ from ..highlevelgraph import HighLevelGraph
def _determine_levels(index):
"""Determine the correct levels argument to groupby.
"""
"""Determine the correct levels argument to groupby."""
if isinstance(index, (tuple, list)) and len(index) > 1:
return list(range(len(index)))
else:
......@@ -73,8 +72,7 @@ def _determine_levels(index):
def _normalize_index(df, index):
"""Replace series with column names in an index wherever possible.
"""
"""Replace series with column names in an index wherever possible."""
if not isinstance(df, DataFrame):
return index
......@@ -983,7 +981,7 @@ def _cumcount_aggregate(a, b, fill_value=None):
class _GroupBy(object):
""" Superclass for DataFrameGroupBy and SeriesGroupBy
"""Superclass for DataFrameGroupBy and SeriesGroupBy
Parameters
----------
......@@ -1545,7 +1543,7 @@ class _GroupBy(object):
@insert_meta_param_description(pad=12)
def apply(self, func, *args, **kwargs):
""" Parallel version of pandas GroupBy.apply
"""Parallel version of pandas GroupBy.apply
This mimics the pandas version except for the following:
......@@ -1633,7 +1631,7 @@ class _GroupBy(object):
@insert_meta_param_description(pad=12)
def transform(self, func, *args, **kwargs):
""" Parallel version of pandas GroupBy.transform
"""Parallel version of pandas GroupBy.transform
This mimics the pandas version except for the following:
......@@ -1792,6 +1790,16 @@ class SeriesGroupBy(_GroupBy):
@derived_from(pd.core.groupby.SeriesGroupBy)
def nunique(self, split_every=None, split_out=1):
"""
Examples
--------
>>> import pandas as pd
>>> import dask.dataframe as dd
>>> d = {'col1': [1, 2, 3, 4], 'col2': [5, 6, 7, 8]}
>>> df = pd.DataFrame(data=d)
>>> ddf = dd.from_pandas(df, 2)
>>> ddf.groupby(['col1']).col2.nunique().compute()
"""
name = self._meta.obj.name
levels = _determine_levels(self.index)
......
......@@ -18,7 +18,7 @@ def compute_first_bit(a):
"Compute the position of the first nonzero bit for each int in an array."
# TODO: consider making this less memory-hungry
bits = np.bitwise_and.outer(a, 1 << np.arange(32))
bits = bits.cumsum(axis=1).astype(np.bool)
bits = bits.cumsum(axis=1).astype(bool)
return 33 - bits.sum(axis=1)
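# Rough sketch of the intended behavior (assumed, based on the code above):
# positions are 1-indexed from the least significant bit, e.g.
#   compute_first_bit(np.array([1, 4, 6]))  ->  array([1, 3, 2])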
......
......@@ -283,7 +283,7 @@ class _LocIndexer(_IndexerBase):
def _partition_of_index_value(divisions, val):
""" In which partition does this value lie?
"""In which partition does this value lie?
>>> _partition_of_index_value([0, 5, 10], 3)
0
......@@ -303,7 +303,7 @@ def _partition_of_index_value(divisions, val):
def _partitions_of_index_values(divisions, values):
""" Return defaultdict of division and values pairs
"""Return defaultdict of division and values pairs
Each key corresponds to a division, and the associated values are the index
values that belong to that division.
......@@ -326,7 +326,7 @@ def _partitions_of_index_values(divisions, values):
def _coerce_loc_index(divisions, o):
""" Transform values to be comparable against divisions
"""Transform values to be comparable against divisions
This is particularly valuable to use with pandas datetimes
"""
......