# File: catalyst/zipline/lib/labelarray.py
# Snapshot: 2017-06-07 17:07:19 -04:00 (717 lines, 24 KiB, Python)

"""
An ndarray subclass for working with arrays of strings.
"""
from functools import partial
from operator import eq, ne
import re
import numpy as np
from numpy import ndarray
import pandas as pd
from toolz import compose
from zipline.utils.compat import unicode
from zipline.utils.preprocess import preprocess
from zipline.utils.sentinel import sentinel
from zipline.utils.input_validation import (
coerce,
expect_kinds,
expect_types,
optional,
)
from zipline.utils.numpy_utils import (
bool_dtype,
unsigned_int_dtype_with_size_in_bytes,
is_object,
)
from zipline.utils.pandas_utils import ignore_pandas_nan_categorical_warning
from ._factorize import (
factorize_strings,
factorize_strings_known_categories,
smallest_uint_that_can_hold,
)
def compare_arrays(left, right):
    """
    Check two arrays for equality, short-circuiting on identity.

    Returns True immediately when ``left`` and ``right`` are the same object.
    Otherwise the arrays are equal only if their shapes match and every
    element compares equal.
    """
    if left is right:
        return True
    if left.shape != right.shape:
        return False
    return (left == right).all()
def _make_unsupported_method(name):
    """
    Build a stand-in for the ndarray member ``name`` that always fails.

    The returned function accepts any arguments and raises
    NotImplementedError when called. Its ``__name__`` and ``__doc__`` are
    derived from ``name``.
    """
    error_message = "Method %s is not supported on LabelArrays." % name

    def method(*args, **kwargs):
        raise NotImplementedError(error_message)

    method.__doc__ = "Unsupported LabelArray Method: %s" % name
    method.__name__ = name
    return method
class MissingValueMismatch(ValueError):
    """
    Raised when an operation combines LabelArrays whose missing_values
    differ.

    Parameters
    ----------
    left, right : object
        The two missing_values that failed to match. Both are rendered into
        the error message.
    """
    def __init__(self, left, right):
        message = (
            "LabelArray missing_values don't match: left={}, right={}"
        ).format(left, right)
        super(MissingValueMismatch, self).__init__(message)
class CategoryMismatch(ValueError):
    """
    Error raised on attempt to perform operations between LabelArrays with
    mismatched category arrays.

    Parameters
    ----------
    left, right : np.ndarray
        The two category arrays that failed to compare equal. They may
        differ element-wise or in shape.
    """
    def __init__(self, left, right):
        if left.shape != right.shape:
            # Arrays of different shapes cannot be compared element-wise, so
            # report the shapes rather than per-index mismatches.
            message = (
                "LabelArray categories don't match:\n"
                "Left has shape {left_shape}, but right has shape"
                " {right_shape}.\n"
                "Left: {left}\n"
                "Right: {right}".format(
                    left_shape=left.shape,
                    right_shape=right.shape,
                    left=left,
                    right=right,
                )
            )
        else:
            (mismatches,) = np.where(left != right)
            # Internal invariant: callers only raise this after a failed
            # comparison, so at least one index must differ.
            assert len(mismatches), "Not actually a mismatch!"
            message = (
                "LabelArray categories don't match:\n"
                "Mismatched Indices: {mismatches}\n"
                "Left: {left}\n"
                "Right: {right}".format(
                    mismatches=mismatches,
                    left=left[mismatches],
                    right=right[mismatches],
                )
            )
        super(CategoryMismatch, self).__init__(message)
# Default-argument marker used by ``LabelArray.view`` to tell "argument was
# omitted" apart from an explicitly passed ``None``.
_NotPassed = sentinel('_NotPassed')
class LabelArray(ndarray):
"""
An ndarray subclass for working with arrays of strings.

Factorizes the input array into integers, but overloads equality on strings
to check against the factor label.

Parameters
----------
values : array-like
Array of values that can be passed to np.asarray with dtype=object.
missing_value : str
Scalar value to treat as 'missing' for operations on ``self``.
categories : list[str], optional
List of values to use as categories. If not supplied, categories will
be inferred as the unique set of entries in ``values``.
sort : bool, optional
Whether to sort categories. If sort is False and categories is
supplied, they are left in the order provided. If sort is False and
categories is None, categories will be constructed in a random order.

Attributes
----------
categories : ndarray[str]
An array containing the unique labels of self.
reverse_categories : dict[str -> int]
Reverse lookup table for ``categories``. Stores the index in
``categories`` at which each unique entry is found.
missing_value : str or None
A sentinel missing value with NaN semantics for comparisons.

Notes
-----
Consumers should be cautious when passing instances of LabelArray to numpy
functions. We attempt to disallow as many meaningless operations as
possible, but since a LabelArray is just an ndarray of ints with some
additional metadata, many numpy functions (for example, trigonometric) will
happily accept a LabelArray and treat its values as though they were
integers.

In a future change, we may be able to disallow more numerical operations by
creating a wrapper dtype which doesn't register an implementation for most
numpy ufuncs. Until that change is made, consumers of LabelArray should
assume that it is undefined behavior to pass a LabelArray to any numpy
ufunc that operates on semantically-numerical data.

See Also
--------
http://docs.scipy.org/doc/numpy-1.10.0/user/basics.subclassing.html
"""
# Scalar types accepted for ``missing_value`` and for scalar assignment and
# comparison against a LabelArray.
SUPPORTED_SCALAR_TYPES = (bytes, unicode, type(None))
# The same types without NoneType; ``map`` requires these as return types
# when ``missing_value`` is not None.
SUPPORTED_NON_NONE_SCALAR_TYPES = (bytes, unicode)
@preprocess(
    values=coerce(list, partial(np.asarray, dtype=object)),
    categories=coerce(np.ndarray, list),
)
@expect_types(
    values=np.ndarray,
    missing_value=SUPPORTED_SCALAR_TYPES,
    categories=optional(list),
)
@expect_kinds(values=("O", "S", "U"))
def __new__(cls,
            values,
            missing_value,
            categories=None,
            sort=True):
    """
    Factorize ``values`` into integer codes and wrap them as a LabelArray.

    Parameters
    ----------
    values : np.ndarray
        Array of labels; non-object arrays are converted to object dtype
        before factorizing.
    missing_value : bytes, unicode, or None
        Label to treat as missing.
    categories : list, optional
        Known categories. When omitted, categories are derived from
        ``values`` by ``factorize_strings``.
    sort : bool, optional
        Forwarded to the factorization routine.
    """
    # Fixed-width numpy string dtypes are slow to work with; object arrays
    # are faster than bytes or unicode arrays in almost all cases.
    if not is_object(values):
        values = values.astype(object)

    flat_values = values.ravel()
    if categories is None:
        factorized = factorize_strings(
            flat_values,
            missing_value=missing_value,
            sort=sort,
        )
    else:
        factorized = factorize_strings_known_categories(
            flat_values,
            categories=categories,
            missing_value=missing_value,
            sort=sort,
        )
    codes, categories, reverse_categories = factorized

    # Category arrays are exposed read-only.
    categories.setflags(write=False)

    return cls.from_codes_and_metadata(
        codes=codes.reshape(values.shape),
        categories=categories,
        reverse_categories=reverse_categories,
        missing_value=missing_value,
    )
@classmethod
def from_codes_and_metadata(cls,
                            codes,
                            categories,
                            reverse_categories,
                            missing_value):
    """
    Rebuild a LabelArray from already-computed codes and metadata.

    The result is a view of ``codes`` (typed as ``cls`` with a void dtype)
    carrying the supplied metadata.

    Parameters
    ----------
    codes : np.ndarray[integral]
        The integer codes backing the label array.
    categories : np.ndarray[object]
        The unique string categories.
    reverse_categories : dict[str, int]
        Mapping from each category to its code.
    missing_value : any
        The value used to represent missing data.
    """
    label_array = codes.view(type=cls, dtype=np.void)
    label_array._missing_value = missing_value
    label_array._reverse_categories = reverse_categories
    label_array._categories = categories
    return label_array
@classmethod
def from_categorical(cls, categorical, missing_value=None):
    """
    Build a LabelArray from a pandas categorical.

    Parameters
    ----------
    categorical : pd.Categorical
        The categorical object to convert.
    missing_value : bytes, unicode, or None, optional
        The missing value to use for the new LabelArray.

    Returns
    -------
    la : LabelArray
        The LabelArray representation of ``categorical``.
    """
    # NOTE(review): ``__new__`` declares ``values`` as np.ndarray and
    # ``categories`` as an optional list -- confirm that a pd.Categorical
    # and its ``categories`` index are accepted by those checks.
    known_categories = categorical.categories
    return LabelArray(categorical, missing_value, known_categories)
@property
def categories(self):
    """
    The array of unique labels.

    Exposed as a read-only property because it is meant to be immutable.
    """
    return self._categories
@property
def reverse_categories(self):
    """
    The lookup table from label to integer code.

    Exposed as a read-only property because it is meant to be immutable.
    """
    return self._reverse_categories
@property
def missing_value(self):
    """
    The label treated as missing.

    Exposed as a read-only property because it is meant to be immutable.
    """
    return self._missing_value
@property
def missing_value_code(self):
    """
    The integer code under which ``missing_value`` is stored.
    """
    code_for_label = self.reverse_categories
    return code_for_label[self.missing_value]
def has_label(self, value):
    """
    Return True if ``value`` is one of this array's categories.
    """
    known_labels = self.reverse_categories
    return value in known_labels
def __array_finalize__(self, obj):
    """
    Hook invoked by numpy once a new array object has been created.

    Three situations lead here:

    1. Direct construction, e.g.::

           >>> ndarray.__new__(LabelArray, ...)  # doctest: +SKIP

       ``obj`` is None in this case, which is rejected with a TypeError.

    2. View casting (most likely from our own constructors), e.g.::

           >>> other_array.view(type=LabelArray)  # doctest: +SKIP

       ``self`` is the new LabelArray and ``obj`` is the array being
       viewed. Whoever called ``obj.view`` is responsible for attaching
       category metadata to ``self`` afterwards.

    3. Slicing an existing LabelArray. ``obj`` is the original
       LabelArray, and its category metadata is copied onto ``self``.
    """
    if obj is None:
        raise TypeError(
            "Direct construction of LabelArrays is not supported."
        )

    # Copy metadata from ``obj`` when it carries any (case 3); otherwise
    # each attribute starts as None until the caller fills it in (case 2).
    for public_name in ('categories', 'reverse_categories', 'missing_value'):
        setattr(self, '_' + public_name, getattr(obj, public_name, None))
def as_int_array(self):
    """
    View self as a plain ndarray of unsigned integer codes.

    This is an O(1) operation: the result is a view, so the underlying data
    is not copied.
    """
    code_dtype = unsigned_int_dtype_with_size_in_bytes(self.itemsize)
    return self.view(type=ndarray, dtype=code_dtype)
def as_string_array(self):
    """
    Expand self back into an array of labels.

    This is an O(N) operation: every code is looked up in ``categories``.
    """
    labels = self.categories
    return labels[self.as_int_array()]
def as_categorical(self, name=None):
    """
    Coerce self into a pandas categorical.

    This is only defined on 1D arrays, since that's all pandas supports.

    Parameters
    ----------
    name : object, optional
        Forwarded to ``pd.Categorical.from_codes`` only when it is not
        None.

    Raises
    ------
    ValueError
        If self has more than one dimension.
    """
    if len(self.shape) > 1:
        raise ValueError("Can't convert a 2D array to a categorical.")

    # Newer pandas releases removed the ``name`` keyword from
    # ``Categorical.from_codes`` and reject it even when it is None, so it
    # is only forwarded when the caller actually supplied one.
    extra_kwargs = {} if name is None else {'name': name}

    with ignore_pandas_nan_categorical_warning():
        return pd.Categorical.from_codes(
            self.as_int_array(),
            # We need to make a copy because pandas >= 0.17 fails if this
            # buffer isn't writeable.
            self.categories.copy(),
            ordered=False,
            **extra_kwargs
        )
def as_categorical_frame(self, index, columns, name=None):
    """
    Coerce self into a pandas DataFrame of Categoricals.

    Parameters
    ----------
    index, columns : sized iterables
        Row and column labels; their lengths must match ``self.shape``.
    name : object, optional
        Forwarded to ``as_categorical``.

    Raises
    ------
    ValueError
        If self is not 2D, or its shape disagrees with the labels.
    """
    actual_shape = self.shape
    if len(actual_shape) != 2:
        raise ValueError(
            "Can't convert a non-2D LabelArray into a DataFrame."
        )

    expected_shape = (len(index), len(columns))
    if actual_shape != expected_shape:
        raise ValueError(
            "Can't construct a DataFrame with provided indices:\n\n"
            "LabelArray shape is {actual}, but index and columns imply "
            "that shape should be {expected}.".format(
                actual=actual_shape,
                expected=expected_shape,
            )
        )

    # Build a flat (row, column)-indexed series, then pivot the column
    # level back out to recover the 2D layout.
    flat_index = pd.MultiIndex.from_product([index, columns])
    flat_values = self.ravel().as_categorical(name=name)
    flat_series = pd.Series(index=flat_index, data=flat_values)
    return flat_series.unstack()
def __setitem__(self, indexer, value):
    """
    Assign ``value`` at ``indexer``.

    ``value`` may be another LabelArray with identical categories, or a
    supported scalar that is already one of this array's categories.

    Raises
    ------
    CategoryMismatch
        If ``value`` is a LabelArray whose categories differ from ours.
    ValueError
        If ``value`` is a scalar that is not among our categories.
    NotImplementedError
        If ``value`` is of any other type.
    """
    if isinstance(value, LabelArray):
        own_categories = self.categories
        other_categories = value.categories
        if not compare_arrays(own_categories, other_categories):
            raise CategoryMismatch(own_categories, other_categories)
        # Identical categories mean the codes can be copied directly.
        return super(LabelArray, self).__setitem__(indexer, value)

    if not isinstance(value, self.SUPPORTED_SCALAR_TYPES):
        raise NotImplementedError(
            "Setting into a LabelArray with a value of "
            "type {type} is not yet supported.".format(
                type=type(value).__name__,
            ),
        )

    value_code = self.reverse_categories.get(value, -1)
    if value_code < 0:
        raise ValueError("%r is not in LabelArray categories." % value)
    # Write the code through the integer view of our own buffer.
    self.as_int_array()[indexer] = value_code
def __setslice__(self, i, j, sequence):
    """
    Forward legacy slice assignment to ``__setitem__``.

    This hook was deprecated in Python 2.0 and predates slice objects, but
    Python 2.7.11 still calls it when it is implemented, which ndarray
    does. Newer Pythons always call ``__setitem__``, so on py2 we have to
    forward manually.
    """
    key = slice(i, j)
    self.__setitem__(key, sequence)
def __getitem__(self, indexer):
    """
    Index into self, returning a LabelArray for array results and the
    corresponding category entry for scalar results.
    """
    selected = super(LabelArray, self).__getitem__(indexer)
    if not selected.ndim:
        # A scalar comes back as an np.void instance; reinterpret it as
        # its integer code and look the label up in our categories.
        code = selected.view(
            unsigned_int_dtype_with_size_in_bytes(self.itemsize),
        )
        return self.categories[code]

    # Anything with dimensions is still a LabelArray.
    return selected
def is_missing(self):
    """
    Boolean mask of the locations holding the missing value.

    Analogous to isnan, but for our stored missing-value code.
    """
    codes = self.as_int_array()
    missing_code = self.reverse_categories[self.missing_value]
    return codes == missing_code
def not_missing(self):
    """
    Boolean mask of the locations not holding the missing value.

    Analogous to ~isnan, but for our stored missing-value code.
    """
    codes = self.as_int_array()
    missing_code = self.reverse_categories[self.missing_value]
    return codes != missing_code
def _equality_check(op):
    """
    Build the implementation shared by ``__eq__`` and ``__ne__``.

    ``op`` is the comparison operator to apply. In every supported case the
    raw comparison is masked so that missing locations yield False.
    """
    def method(self, other):
        if isinstance(other, LabelArray):
            own_missing, other_missing = (
                self.missing_value, other.missing_value
            )
            if own_missing != other_missing:
                raise MissingValueMismatch(own_missing, other_missing)

            own_categories, other_categories = (
                self.categories, other.categories
            )
            if not compare_arrays(own_categories, other_categories):
                raise CategoryMismatch(own_categories, other_categories)

            # Identical categories mean codes can be compared directly.
            codes_compared = op(self.as_int_array(), other.as_int_array())
            return codes_compared & self.not_missing() & other.not_missing()

        if isinstance(other, ndarray):
            # Compare to ndarrays as though we were an array of strings.
            # This is fairly expensive, and should generally be avoided.
            return op(self.as_string_array(), other) & self.not_missing()

        if isinstance(other, self.SUPPORTED_SCALAR_TYPES):
            # Labels we don't know map to -1; the codes are viewed as
            # unsigned integers.
            code = self._reverse_categories.get(other, -1)
            return op(self.as_int_array(), code) & self.not_missing()

        # NOTE(review): this hands the ``super`` proxy object itself to
        # ``op`` -- confirm that is the intended fallback rather than
        # delegating to ndarray's own comparison.
        return op(super(LabelArray, self), other)
    return method
__eq__ = _equality_check(eq)
__ne__ = _equality_check(ne)
del _equality_check
def view(self, dtype=_NotPassed, type=_NotPassed):
    """
    Like ``ndarray.view``, but refuses to reinterpret a LabelArray under a
    different dtype unless a ``type`` is also given.

    Raises
    ------
    TypeError
        If only ``dtype`` is passed and it differs from ``self.dtype``.
    """
    type_passed = type is not _NotPassed
    if not type_passed and dtype not in (_NotPassed, self.dtype):
        raise TypeError("Can't view LabelArray as another dtype.")

    # ndarray.view's text signature suggests both defaults are None, but
    # explicitly passing None behaves differently from omitting the
    # argument. Forward only the arguments the caller actually supplied.
    forwarded = {}
    if type_passed:
        forwarded['type'] = type
    if dtype is not _NotPassed:
        forwarded['dtype'] = dtype
    return super(LabelArray, self).view(**forwarded)
# In general, we support resizing, slicing, and reshaping methods, but not
# numeric methods.
#
# Names of public ndarray members that keep their normal behavior. The set
# mixes methods (e.g. 'reshape') with plain attributes (e.g. 'shape').
SUPPORTED_NDARRAY_METHODS = frozenset([
'base',
'compress',
'copy',
'data',
'diagonal',
'dtype',
'flat',
'flatten',
'item',
'itemset',
'itemsize',
'nbytes',
'ndim',
'ravel',
'repeat',
'reshape',
'resize',
'setflags',
'shape',
'size',
'squeeze',
'strides',
'swapaxes',
'take',
'trace',
'transpose',
'view'
])
# Every name on ndarray that does not start with an underscore.
PUBLIC_NDARRAY_METHODS = frozenset([
s for s in dir(ndarray) if not s.startswith('_')
])
# Generate failing wrappers for all unsupported methods.
# Each public-but-unsupported name is bound, in the namespace returned by
# locals(), to a function that raises NotImplementedError when called.
locals().update(
{
method: _make_unsupported_method(method)
for method in PUBLIC_NDARRAY_METHODS - SUPPORTED_NDARRAY_METHODS
}
)
def __repr__(self):
    """
    Render self like the repr of the equivalent string array, with the
    ``array(`` prefix replaced by ``LabelArray(`` and the trailing dtype
    annotation removed.
    """
    repr_lines = repr(self.as_string_array()).splitlines()
    repr_lines[0] = repr_lines[0].replace('array(', 'LabelArray(', 1)
    # Drop the trailing ``, dtype=...`` and re-close the call.
    repr_lines[-1] = repr_lines[-1].rsplit(',', 1)[0] + ')'

    # numpy aligns continuation lines under 'array('. Pad each of them by
    # the difference in length between 'array(' and 'LabelArray(' so they
    # line up under the new, longer prefix.
    padding = ' ' * (len('LabelArray(') - len('array('))
    return ('\n' + padding).join(repr_lines)
def empty_like(self, shape):
    """
    Build a LabelArray of the given ``shape`` that shares this array's
    categories and is filled entirely with ``self.missing_value``.
    """
    missing_code = self.reverse_categories[self.missing_value]
    code_dtype = unsigned_int_dtype_with_size_in_bytes(self.itemsize)
    filled_codes = np.full(shape, missing_code, dtype=code_dtype)
    return type(self).from_codes_and_metadata(
        codes=filled_codes,
        categories=self.categories,
        reverse_categories=self.reverse_categories,
        missing_value=self.missing_value,
    )
def map_predicate(self, f):
    """
    Map a function from str -> bool element-wise over ``self``.

    ``f`` will be applied exactly once to each non-missing unique value in
    ``self``. Missing values will always return False.

    Parameters
    ----------
    f : callable[str -> bool]
        Predicate to evaluate on each non-missing category.

    Returns
    -------
    results : np.ndarray[bool]
        An array with the same shape as self.
    """
    missing_value = self.missing_value

    # Never hand the missing value to ``f``: it is not a real label (it
    # may even be None), and its result is forced to False regardless.
    def f_to_use(x):
        return False if x == missing_value else f(x)

    # Call f on each unique value in our categories.
    results = np.vectorize(f_to_use, otypes=[bool_dtype])(self.categories)

    # missing_value should produce False no matter what
    results[self.reverse_categories[missing_value]] = False

    # Unpack the per-category results into their corresponding locations
    # in our codes.
    return results[self.as_int_array()]
def map(self, f):
    """
    Map a function from str -> str element-wise over ``self``.

    ``f`` will be applied exactly once to each non-missing unique value in
    ``self``. Missing values will always map to ``self.missing_value``.

    Parameters
    ----------
    f : callable[str -> str]
        Function applied to each non-missing category. It may return None
        only when ``self.missing_value`` is None.

    Returns
    -------
    mapped : LabelArray
        A new LabelArray with the same shape and missing_value as self.

    Raises
    ------
    TypeError
        If ``f`` returns a value of an unsupported type.
    """
    missing_value = self.missing_value

    # f() should only return None if None is our missing value.
    if missing_value is None:
        allowed_outtypes = self.SUPPORTED_SCALAR_TYPES
    else:
        allowed_outtypes = self.SUPPORTED_NON_NONE_SCALAR_TYPES

    def f_to_use(x):
        if x == missing_value:
            return x
        ret = f(x)
        if not isinstance(ret, allowed_outtypes):
            raise TypeError(
                "LabelArray.map expected function {f} to return a string"
                " or None, but got {type} instead.\n"
                "Value was {value}.".format(
                    # Not every callable has a __name__ (for example,
                    # functools.partial objects), so fall back to repr.
                    f=getattr(f, '__name__', repr(f)),
                    type=type(ret).__name__,
                    value=ret,
                )
            )
        return ret

    # One output label per existing category, in category (code) order.
    mapped_labels = [f_to_use(label) for label in self.categories]

    # If f() maps multiple inputs to the same output, the same label shows
    # up several times. Collapse the duplicates into a sorted list of
    # distinct labels. None is kept out of the sort, because None and
    # strings cannot be ordered against each other on Python 3, and is
    # placed first instead.
    distinct_labels = set(mapped_labels)
    has_none = None in distinct_labels
    distinct_labels.discard(None)
    ordered_labels = sorted(distinct_labels)
    if has_none:
        ordered_labels.insert(0, None)

    new_categories = np.array(ordered_labels, dtype=object)
    new_reverse_categories = {
        label: code for code, label in enumerate(ordered_labels)
    }

    # Lookup table from each old code to its new code, stored in the
    # smallest unsigned dtype able to hold the new codes.
    old_code_to_new_code = np.array(
        [new_reverse_categories[label] for label in mapped_labels],
        dtype=smallest_uint_that_can_hold(len(ordered_labels)),
    )
    new_codes = np.take(old_code_to_new_code, self.as_int_array())

    return self.from_codes_and_metadata(
        new_codes,
        new_categories,
        new_reverse_categories,
        missing_value=missing_value,
    )
def startswith(self, prefix):
    """
    Element-wise startswith.

    Parameters
    ----------
    prefix : str

    Returns
    -------
    matches : np.ndarray[bool]
        An array with the same shape as self indicating whether each
        element of self started with ``prefix``.
    """
    def begins_with_prefix(label):
        return label.startswith(prefix)

    return self.map_predicate(begins_with_prefix)
def endswith(self, suffix):
    """
    Element-wise endswith.

    Parameters
    ----------
    suffix : str

    Returns
    -------
    matches : np.ndarray[bool]
        An array with the same shape as self indicating whether each
        element of self ended with ``suffix``.
    """
    def finishes_with_suffix(label):
        return label.endswith(suffix)

    return self.map_predicate(finishes_with_suffix)
def has_substring(self, substring):
    """
    Element-wise substring containment.

    Parameters
    ----------
    substring : str

    Returns
    -------
    matches : np.ndarray[bool]
        An array with the same shape as self indicating whether each
        element of self contained ``substring``.
    """
    def contains_substring(label):
        return substring in label

    return self.map_predicate(contains_substring)
@preprocess(pattern=coerce(from_=(bytes, unicode), to=re.compile))
def matches(self, pattern):
    """
    Element-wise regex match.

    Parameters
    ----------
    pattern : str or compiled regex

    Returns
    -------
    matches : np.ndarray[bool]
        An array with the same shape as self indicating whether each
        element of self was matched by ``pattern``.
    """
    match = pattern.match

    def is_matched(label):
        # ``match`` returns a match object or None; reduce it to a bool.
        return bool(match(label))

    return self.map_predicate(is_matched)
# list, tuple and ndarray all implement an O(N) __contains__, so they are
# converted to a `set` up front.
@preprocess(container=coerce((list, tuple, np.ndarray), set))
def element_of(self, container):
    """
    Check whether each element of self is an element of ``container``.

    Parameters
    ----------
    container : object
        An object implementing a __contains__ to call on each element of
        ``self``.

    Returns
    -------
    is_contained : np.ndarray[bool]
        An array with the same shape as self indicating whether each
        element of self was an element of ``container``.
    """
    is_member = container.__contains__
    return self.map_predicate(is_member)