mirror of
https://github.com/wassname/catalyst.git
synced 2026-06-28 19:47:13 +08:00
504 lines
16 KiB
Python
504 lines
16 KiB
Python
"""
|
|
An ndarray subclass for working with arrays of strings.
|
|
"""
|
|
from functools import partial
|
|
from numbers import Number
|
|
from operator import eq, ne
|
|
import re
|
|
|
|
import numpy as np
|
|
from numpy import ndarray
|
|
import pandas as pd
|
|
from toolz import compose
|
|
|
|
from zipline.utils.preprocess import preprocess
|
|
from zipline.utils.sentinel import sentinel
|
|
from zipline.utils.input_validation import (
|
|
coerce,
|
|
expect_kinds,
|
|
expect_types,
|
|
optional,
|
|
)
|
|
from zipline.utils.numpy_utils import is_object, int64_dtype
|
|
|
|
from ._factorize import (
|
|
factorize_strings,
|
|
factorize_strings_known_categories,
|
|
)
|
|
|
|
|
|
def compare_arrays(left, right):
|
|
"Eq check with a short-circuit for identical objects."
|
|
return (
|
|
left is right
|
|
or ((left.shape == right.shape) and (left == right).all())
|
|
)
|
|
|
|
|
|
def _make_unsupported_method(name):
|
|
def method(*args, **kwargs):
|
|
raise NotImplementedError(
|
|
"Method %s is not supported on LabelArrays." % name
|
|
)
|
|
method.__name__ = name
|
|
method.__doc__ = "Unsupported LabelArray Method: %s" % name
|
|
return method
|
|
|
|
|
|
class CategoryMismatch(ValueError):
|
|
"""
|
|
Error raised on attempt to perform operations between LabelArrays with
|
|
mismatched category arrays.
|
|
"""
|
|
def __init__(self, left, right):
|
|
(mismatches,) = np.where(left != right)
|
|
assert len(mismatches), "Not actually a mismatch!"
|
|
super(CategoryMismatch, self).__init__(
|
|
"LabelArray categories don't match:\n"
|
|
"Mismatched Indices: {mismatches}\n"
|
|
"Left: {left}\n"
|
|
"Right: {right}".format(
|
|
mismatches=mismatches,
|
|
left=left[mismatches],
|
|
right=right[mismatches],
|
|
)
|
|
)
|
|
|
|
_NotPassed = sentinel('_NotPassed')
|
|
|
|
|
|
class LabelArray(ndarray):
|
|
"""
|
|
An ndarray subclass for working with arrays of strings.
|
|
|
|
Factorizes the input array into integers, but overloads equality on strings
|
|
to check against the factor label.
|
|
|
|
See Also
|
|
--------
|
|
http://docs.scipy.org/doc/numpy-1.10.0/user/basics.subclassing.html
|
|
"""
|
|
@preprocess(
|
|
values=coerce(list, partial(np.asarray, dtype=object)),
|
|
)
|
|
@expect_types(
|
|
values=np.ndarray,
|
|
categories=optional(list),
|
|
)
|
|
@expect_kinds(values=("O", "S", "U"))
|
|
def __new__(cls,
|
|
values,
|
|
missing_value,
|
|
categories=None,
|
|
sort=True):
|
|
|
|
# Numpy's fixed-width string types aren't very efficient. Working with
|
|
# object arrays is faster than bytes or unicode arrays in almost all
|
|
# cases.
|
|
if not is_object(values):
|
|
values = values.astype(object)
|
|
|
|
if categories is None:
|
|
codes, categories, reverse_categories = factorize_strings(
|
|
values.ravel(),
|
|
missing_value=missing_value,
|
|
sort=sort,
|
|
)
|
|
else:
|
|
codes, categories, reverse_categories = (
|
|
factorize_strings_known_categories(
|
|
values.ravel(),
|
|
categories=categories,
|
|
missing_value=missing_value,
|
|
sort=sort,
|
|
)
|
|
)
|
|
categories.setflags(write=False)
|
|
|
|
ret = codes.reshape(values.shape).view(type=cls)
|
|
ret._categories = categories
|
|
ret._reverse_categories = reverse_categories
|
|
ret._missing_value = missing_value
|
|
return ret
|
|
|
|
@property
|
|
def categories(self):
|
|
# This is a property because it should be immutable.
|
|
return self._categories
|
|
|
|
@property
|
|
def reverse_categories(self):
|
|
# This is a property because it should be immutable.
|
|
return self._reverse_categories
|
|
|
|
@property
|
|
def missing_value(self):
|
|
# This is a property because it should be immutable.
|
|
return self._missing_value
|
|
|
|
def __array_finalize__(self, obj):
|
|
"""
|
|
Called by Numpy after array construction.
|
|
|
|
There are three cases where this can happen:
|
|
|
|
1. Someone tries to directly construct a new array by doing::
|
|
|
|
>>> ndarray.__new__(LabelArray, ...)
|
|
|
|
In this case, obj will be None. We treat this as an error case and
|
|
fail.
|
|
|
|
2. Someone (most likely our own __new__) calls
|
|
other_array.view(type=LabelArray).
|
|
|
|
In this case, `self` will be the new LabelArray instance, and
|
|
``obj` will be the array on which ``view`` is being called.
|
|
|
|
The caller of ``obj.view`` is responsible for copying setting
|
|
category metadata on ``self`` after we exit.
|
|
|
|
3. Someone creates a new LabelArray by slicing an existing one.
|
|
|
|
In this case, ``obj`` will be the original LabelArray. We're
|
|
responsible for copying over the parent array's category metadata.
|
|
"""
|
|
if obj is None:
|
|
raise TypeError(
|
|
"Direct construction of LabelArrays is not supported."
|
|
)
|
|
|
|
# See docstring for an explanation of when these will or will not be
|
|
# set.
|
|
self._categories = getattr(obj, 'categories', None)
|
|
self._reverse_categories = getattr(obj, 'reverse_categories', None)
|
|
self._missing_value = getattr(obj, 'missing_value', None)
|
|
|
|
def __array_wrap__(self, obj, context=None):
|
|
"""
|
|
Called by numpy after completion of a ufunc.
|
|
|
|
We coerce back into a vanilla ndarray if our dtype changed, since that
|
|
indicates that our categories are no longer meaningful.
|
|
"""
|
|
if obj.dtype != self.dtype:
|
|
return obj.view(type=np.ndarray)
|
|
return obj
|
|
|
|
def as_int_array(self):
|
|
"""
|
|
Convert self into a regular ndarray of ints.
|
|
|
|
This is an O(1) operation. It does not copy the underlying data.
|
|
"""
|
|
return self.view(type=ndarray)
|
|
|
|
def as_string_array(self):
|
|
"""
|
|
Convert self back into an array of strings.
|
|
|
|
This is an O(N) operation.
|
|
"""
|
|
return self.categories[self]
|
|
|
|
def as_categorical(self, name=None):
|
|
"""
|
|
Coerce self into a pandas categorical.
|
|
|
|
This is only defined on 1D arrays, since that's all pandas supports.
|
|
"""
|
|
if len(self.shape) > 1:
|
|
raise ValueError("Can't convert a 2D array to a categorical.")
|
|
return pd.Categorical.from_codes(
|
|
self.as_int_array(),
|
|
self.categories,
|
|
ordered=False,
|
|
name=name,
|
|
)
|
|
|
|
def as_categorical_frame(self, index, columns, name=None):
|
|
"""
|
|
Coerce self into a pandas DataFrame of Categoricals.
|
|
"""
|
|
if len(self.shape) != 2:
|
|
raise ValueError(
|
|
"Can't convert a non-2D LabelArray into a DataFrame."
|
|
)
|
|
|
|
expected_shape = (len(index), len(columns))
|
|
if expected_shape != self.shape:
|
|
raise ValueError(
|
|
"Can't construct a DataFrame with provided indices:\n\n"
|
|
"LabelArray shape is {actual}, but index and columns imply "
|
|
"that shape should be {expected}.".format(
|
|
actual=self.shape,
|
|
expected=expected_shape,
|
|
)
|
|
)
|
|
|
|
return pd.Series(
|
|
index=pd.MultiIndex.from_product([index, columns]),
|
|
data=self.ravel().as_categorical(name=name),
|
|
).unstack()
|
|
|
|
def __setitem__(self, indexer, value):
|
|
self_categories = self.categories
|
|
|
|
if isinstance(value, LabelArray):
|
|
value_categories = value.categories
|
|
if compare_arrays(self_categories, value_categories):
|
|
return super(LabelArray, self).__setitem__(indexer, value)
|
|
else:
|
|
raise CategoryMismatch(self_categories, value_categories)
|
|
|
|
elif isinstance(value, (bytes, unicode)):
|
|
value_code = self.reverse_categories.get(value, None)
|
|
if value_code is None:
|
|
raise ValueError("%r is not in LabelArray categories." % value)
|
|
return super(LabelArray, self).__setitem__(indexer, value_code)
|
|
|
|
else:
|
|
raise NotImplementedError(
|
|
"Setting into a LabelArray with a value of "
|
|
"type {type} is not yet supported.".format(
|
|
type=type(value).__name__,
|
|
),
|
|
)
|
|
|
|
def _equality_check(op):
|
|
"""
|
|
Shared code for __eq__ and __ne__, parameterized on the actual
|
|
comparison operator to use.
|
|
"""
|
|
# What value should we return if we compare against a value not in our
|
|
# categories?
|
|
if op is eq:
|
|
COMPARE_TO_UNKNOWN = False
|
|
elif op is ne:
|
|
COMPARE_TO_UNKNOWN = True
|
|
else:
|
|
raise AssertionError("_make_equality_check called with %s" % op)
|
|
|
|
def method(self, other):
|
|
self_categories = self.categories
|
|
|
|
if isinstance(other, LabelArray):
|
|
other_categories = other.categories
|
|
if compare_arrays(self_categories, other_categories):
|
|
return op(self.as_int_array(), other.as_int_array())
|
|
else:
|
|
raise CategoryMismatch(self_categories, other_categories)
|
|
|
|
elif isinstance(other, ndarray):
|
|
# Compare to ndarrays as though we were an array of strings.
|
|
# This is fairly expensive, and should generally be avoided.
|
|
return op(self.as_string_array(), other)
|
|
|
|
elif isinstance(other, (bytes, unicode)):
|
|
i = self._reverse_categories.get(other, None)
|
|
if i is None:
|
|
# Requested string isn't in our categories. Short circuit.
|
|
# This isn't full_like because that would try to return a
|
|
# LabelArray.
|
|
return np.full(self.shape, COMPARE_TO_UNKNOWN, dtype=bool)
|
|
|
|
return op(self.as_int_array(), i)
|
|
|
|
elif isinstance(other, Number):
|
|
return NotImplemented
|
|
|
|
return op(super(LabelArray, self), other)
|
|
return method
|
|
|
|
__eq__ = _equality_check(eq)
|
|
__ne__ = _equality_check(ne)
|
|
del _equality_check
|
|
|
|
def view(self, dtype=_NotPassed, type=_NotPassed):
|
|
if type is _NotPassed and dtype not in (_NotPassed, self.dtype):
|
|
raise TypeError("Can't view LabelArray as another dtype.")
|
|
|
|
# The text signature on ndarray.view makes it look like the default
|
|
# values for dtype and type are `None`, but passing None explicitly has
|
|
# different semantics than not passing an arg at all, so we reconstruct
|
|
# the kwargs dict here to simulate the args not being passed at all.
|
|
kwargs = {}
|
|
if dtype is not _NotPassed:
|
|
kwargs['dtype'] = dtype
|
|
if type is not _NotPassed:
|
|
kwargs['type'] = type
|
|
return super(LabelArray, self).view(**kwargs)
|
|
|
|
# In general, we support resizing, slicing, and reshaping methods, but not
|
|
# numeric methods.
|
|
SUPPORTED_NDARRAY_METHODS = frozenset([
|
|
'base',
|
|
'byteswap',
|
|
'compress',
|
|
'copy',
|
|
'data',
|
|
'diagonal',
|
|
'dtype',
|
|
'flat',
|
|
'flatten',
|
|
'item',
|
|
'itemset',
|
|
'itemsize',
|
|
'nbytes',
|
|
'ndim',
|
|
'newbyteorder',
|
|
'ravel',
|
|
'repeat',
|
|
'reshape',
|
|
'resize',
|
|
'setflags',
|
|
'shape',
|
|
'size',
|
|
'squeeze',
|
|
'strides',
|
|
'swapaxes',
|
|
'take',
|
|
'trace',
|
|
'transpose',
|
|
'view'
|
|
])
|
|
PUBLIC_NDARRAY_METHODS = frozenset([
|
|
s for s in dir(ndarray) if not s.startswith('_')
|
|
])
|
|
|
|
# Generate failing wrappers for all unsupported methods.
|
|
locals().update(
|
|
{
|
|
method: _make_unsupported_method(method)
|
|
for method in PUBLIC_NDARRAY_METHODS - SUPPORTED_NDARRAY_METHODS
|
|
}
|
|
)
|
|
|
|
def __repr__(self):
|
|
# This happens if you call a ufunc on a LabelArray that changes the
|
|
# dtype. This is generally an indicator that the array has been used
|
|
# incorrectly, and it means we're no longer valid for anything.
|
|
if self.dtype != int64_dtype:
|
|
return "Invalid LabelArray: dtype={}, shape={}".format(
|
|
self.dtype, self.shape
|
|
)
|
|
repr_lines = repr(self.as_string_array()).splitlines()
|
|
repr_lines[0] = repr_lines[0].replace('array(', 'LabelArray(', 1)
|
|
repr_lines[-1] = repr_lines[-1].rsplit(',', 1)[0] + ')'
|
|
# The extra spaces here account for the difference in length between
|
|
# 'array(' and 'LabelArray('.
|
|
return '\n '.join(repr_lines)
|
|
|
|
def empty_like(self, shape):
|
|
"""
|
|
Make an empty LabelArray with the same categories as ``self``, filled
|
|
with ``self.missing_value``.
|
|
"""
|
|
out = np.full(
|
|
shape,
|
|
self.reverse_categories[self.missing_value],
|
|
dtype=self.dtype
|
|
).view(
|
|
type=type(self)
|
|
)
|
|
|
|
out._categories = self.categories
|
|
out._reverse_categories = self.reverse_categories
|
|
out._missing_value = self.missing_value
|
|
|
|
return out
|
|
|
|
def apply(self, f, dtype):
|
|
"""
|
|
Map a function elementwise over entries in ``self``.
|
|
|
|
``f`` will be applied exactly once to each unique value in ``self``.
|
|
"""
|
|
return np.vectorize(f, otypes=[dtype])(self.categories)[self]
|
|
|
|
def startswith(self, prefix):
|
|
"""
|
|
Element-wise startswith.
|
|
|
|
Parameters
|
|
----------
|
|
prefix : str
|
|
|
|
Returns
|
|
-------
|
|
matches : np.ndarray[bool]
|
|
An array with the same shape as self indicating whether each
|
|
element of self started with ``prefix``.
|
|
"""
|
|
return self.apply(lambda elem: elem.startswith(prefix), dtype=bool)
|
|
|
|
def endswith(self, suffix):
|
|
"""
|
|
Elementwise endswith.
|
|
|
|
Parameters
|
|
----------
|
|
suffix : str
|
|
|
|
Returns
|
|
-------
|
|
matches : np.ndarray[bool]
|
|
An array with the same shape as self indicating whether each
|
|
element of self ended with ``suffix``.w
|
|
"""
|
|
return self.apply(lambda elem: elem.endswith(suffix), dtype=bool)
|
|
|
|
def has_substring(self, substring):
|
|
"""
|
|
Elementwise contains.
|
|
|
|
Parameters
|
|
----------
|
|
substring : str
|
|
|
|
Returns
|
|
-------
|
|
matches : np.ndarray[bool]
|
|
An array with the same shape as self indicating whether each
|
|
element of self ended with ``suffix``.
|
|
"""
|
|
return self.apply(lambda elem: substring in elem, dtype=bool)
|
|
|
|
@preprocess(pattern=coerce(from_=(bytes, unicode), to=re.compile))
|
|
def matches(self, pattern):
|
|
"""
|
|
Elementwise regex match.
|
|
|
|
Parameters
|
|
----------
|
|
pattern : str or compiled regex
|
|
|
|
Returns
|
|
-------
|
|
matches : np.ndarray[bool]
|
|
An array with the same shape as self indicating whether each
|
|
element of self was matched by ``pattern``.
|
|
"""
|
|
return self.apply(compose(bool, pattern.match), dtype=bool)
|
|
|
|
# These types all implement an O(N) __contains__, so pre-emptively
|
|
# coerce to `set`.
|
|
@preprocess(container=coerce((list, tuple, np.ndarray), set))
|
|
def element_of(self, container):
|
|
"""
|
|
Check if each element of self is an of ``container``.
|
|
|
|
Parameters
|
|
----------
|
|
container : object
|
|
An object implementing a __contains__ to call on each element of
|
|
``self``.
|
|
|
|
Returns
|
|
-------
|
|
is_contained : np.ndarray[bool]
|
|
An array with the same shape as self indicating whether each
|
|
element of self was an element of ``container``.
|
|
"""
|
|
return self.apply(container.__contains__, dtype=bool)
|