Merge pull request #1833 from quantopian/labelarray-map

Add support for relabeling classifiers.
This commit is contained in:
Scott Sanderson
2017-06-07 17:57:18 -07:00
committed by GitHub
7 changed files with 427 additions and 30 deletions
+116
View File
@@ -468,6 +468,122 @@ class ClassifierTestCase(BasePipelineTestCase):
)
self.assertEqual(errmsg, expected)
@parameter_space(
    __fail_fast=True,
    labelarray_dtype=(categorical_dtype, bytes_dtype, unicode_dtype),
    relabel_func=[
        lambda s: str(s[0]),
        lambda s: str(len(s)),
        lambda s: str(len([c for c in s if c == 'a'])),
        lambda s: None,
    ]
)
def test_relabel_strings(self, relabel_func, labelarray_dtype):
    """
    ``Classifier.relabel`` should agree with applying ``relabel_func``
    element-wise to the raw labels, for every supported input dtype.
    """

    class C(Classifier):
        inputs = ()
        dtype = categorical_dtype
        missing_value = None
        window_length = 0

    classifier = C()

    labels = np.asarray(
        [['a', 'aa', 'aaa', 'abab'],
         ['bab', 'aba', 'aa', 'bb'],
         ['a', 'aba', 'abaa', 'abaab'],
         ['a', 'aa', 'aaa', 'aaaa']],
        dtype=labelarray_dtype,
    )

    # Reference answer: run the relabeler over every element with numpy,
    # forcing object output so that None results are representable.
    apply_elementwise = np.vectorize(relabel_func, otypes=[object])
    labels_relabeled = apply_elementwise(labels)

    baseline = LabelArray(labels, missing_value=None)

    terms = {'relabeled': classifier.relabel(relabel_func)}
    expected_results = {
        'relabeled': LabelArray(labels_relabeled, missing_value=None),
    }

    all_ones = self.ones_mask(shape=baseline.shape)
    self.check_terms(
        terms,
        expected_results,
        initial_workspace={classifier: baseline},
        mask=self.build_mask(all_ones),
    )
@parameter_space(
    __fail_fast=True,
    missing_value=[None, 'M'],
)
def test_relabel_missing_value_interactions(self, missing_value):
    """
    Inputs that are already missing stay missing after relabeling, and
    labels that the relabeler maps to the missing value become missing.
    """
    mv = missing_value

    class C(Classifier):
        inputs = ()
        dtype = categorical_dtype
        missing_value = mv
        window_length = 0

    classifier = C()

    def relabel_func(s):
        # 'B' is collapsed into the missing value; everything else is
        # doubled ('A' -> 'AA').
        return mv if s == 'B' else ''.join((s, s))

    input_labels = np.asarray(
        [['A', 'B', 'C', mv],
         [mv, 'A', 'B', 'C'],
         ['C', mv, 'A', 'B'],
         ['B', 'C', mv, 'A']],
        dtype=categorical_dtype,
    )
    baseline = LabelArray(input_labels, missing_value=mv)

    expected_labels = np.asarray(
        [['AA', mv, 'CC', mv],
         [mv, 'AA', mv, 'CC'],
         ['CC', mv, 'AA', mv],
         [mv, 'CC', mv, 'AA']],
        dtype=categorical_dtype,
    )

    terms = {'relabeled': classifier.relabel(relabel_func)}
    expected_results = {
        'relabeled': LabelArray(expected_labels, missing_value=mv),
    }

    all_ones = self.ones_mask(shape=baseline.shape)
    self.check_terms(
        terms,
        expected_results,
        initial_workspace={classifier: baseline},
        mask=self.build_mask(all_ones),
    )
def test_relabel_int_classifier_not_yet_supported(self):
    """``relabel`` on an int64 classifier fails with a descriptive TypeError."""

    class C(Classifier):
        inputs = ()
        dtype = int64_dtype
        missing_value = -1
        window_length = 0

    classifier = C()

    def explode(x):
        # The dtype check must reject the call before the relabeler is
        # ever invoked, so reaching this body means the test has failed.
        return 0 / 0

    with self.assertRaises(TypeError) as ctx:
        classifier.relabel(explode)

    self.assertEqual(
        str(ctx.exception),
        "relabel() is only defined on Classifiers producing strings "
        "but it was called on a Classifier of dtype int64.",
    )
class TestPostProcessAndToWorkSpaceValue(ZiplineTestCase):
def test_reversability_categorical(self):
+132
View File
@@ -109,6 +109,71 @@ class LabelArrayTestCase(ZiplineTestCase):
np_contains(strs) & notmissing,
)
@parameter_space(
    __fail_fast=True,
    f=[
        lambda s: str(len(s)),
        lambda s: s[0],
        lambda s: ''.join(reversed(s)),
        lambda s: '',
    ]
)
def test_map(self, f):
    """``LabelArray.map`` gives the same strings as ``np.vectorize``."""
    rows = [
        ['E', 'GHIJ', 'HIJKLMNOP', 'DEFGHIJ'],
        ['CDE', 'ABCDEFGHIJKLMNOPQ', 'DEFGHIJKLMNOPQRS', 'ABCDEFGHIJK'],
        ['DEFGHIJKLMNOPQR', 'DEFGHI', 'DEFGHIJ', 'FGHIJK'],
        ['EFGHIJKLM', 'EFGHIJKLMNOPQRS', 'ABCDEFGHI', 'DEFGHIJ'],
    ]
    data = np.array(rows, dtype=object)
    label_array = LabelArray(data, missing_value=None)

    # Apply ``f`` once with plain numpy and once through the LabelArray,
    # then compare the string views.
    via_numpy = np.vectorize(f)(data)
    via_label_array = label_array.map(f).as_string_array()

    assert_equal(via_numpy, via_label_array)
@parameter_space(missing=['A', None])
def test_map_ignores_missing_value(self, missing):
    """Missing entries pass through ``map`` unchanged."""

    def increment_char(c):
        return chr(ord(c) + 1)

    source = LabelArray(
        np.array([missing, 'B', 'C'], dtype=object),
        missing_value=missing,
    )
    mapped = source.map(increment_char)

    # Only 'B' and 'C' are shifted; the first slot keeps the missing value.
    want = LabelArray([missing, 'C', 'D'], missing_value=missing)
    assert_equal(mapped.as_string_array(), want.as_string_array())
@parameter_space(
    __fail_fast=True,
    f=[
        lambda s: 0,
        lambda s: 0.0,
        lambda s: object(),
    ]
)
def test_map_requires_f_to_return_a_string_or_none(self, f):
    """Mapping a function that returns a non-string raises TypeError."""
    label_array = LabelArray(self.strs, missing_value=None)
    self.assertRaises(TypeError, label_array.map, f)
def test_map_can_only_return_none_if_missing_value_is_none(self):
    """Returning None from the mapped function is only legal when the
    array's missing value is itself None.
    """
    # With None as the missing value, mapping everything to None succeeds
    # and yields an all-missing array.
    none_missing = LabelArray(self.strs, missing_value=None)
    all_missing = LabelArray(
        np.full_like(self.strs, None),
        missing_value=None,
    )
    check_arrays(none_missing.map(lambda x: None), all_missing)

    # With a string missing value, a None result must be rejected.
    str_missing = LabelArray(self.strs, missing_value="__MISSING__")
    with self.assertRaises(TypeError):
        str_missing.map(lambda x: None)
@parameter_space(
__fail_fast=True,
missing_value=('', 'a', 'not in the array', None),
@@ -436,6 +501,73 @@ class LabelArrayTestCase(ZiplineTestCase):
assert_equal(arr.itemsize, 2)
self.check_roundtrip(arr)
def test_map_shrinks_code_storage_if_possible(self):
    """Collapsing many categories into a few should narrow the codes."""
    # One category is dropped so that, together with the None missing
    # value, the codes fit in two bytes.
    wide_categories = self.create_categories(16, plus_one=False)[:-1]
    arr = LabelArray(wide_categories, missing_value=None)
    self.assertEqual(arr.itemsize, 2)

    def either_A_or_B(s):
        # Parity of the summed code points selects the label.
        return 'B' if sum(map(ord, s)) % 2 else 'A'

    result = arr.map(either_A_or_B)

    self.assertEqual(set(result.categories), {'A', 'B', None})
    self.assertEqual(result.itemsize, 1)

    via_numpy = np.vectorize(either_A_or_B)(arr.as_string_array())
    assert_equal(via_numpy, result.as_string_array())
def test_map_never_increases_code_storage_size(self):
    """
    Mapping an impure function must not widen the code dtype.

    The mapped function here hands back a brand new label on every call.
    A naive implementation that called it per element could end up with
    more categories than it started with. Because ``map`` calls the
    function exactly once per unique category, the category count (and
    therefore the code width) cannot grow.
    """
    # Leave out one category so the array, plus None as the missing
    # value, still fits in single-byte codes.
    categories = self.create_categories(8, plus_one=False)[:-1]
    larger_categories = self.create_categories(16, plus_one=False)

    # Each category appears twice, so a per-element call of the mapped
    # function would need more labels than single-byte codes allow.
    arr = LabelArray(categories + categories, missing_value=None)
    assert_equal(arr.itemsize, 1)

    fresh_labels = iter(larger_categories)

    def new_string_every_time(c):
        # Ignores its input and returns the next unused label.
        return next(fresh_labels)

    result = arr.map(new_string_every_time)

    # The codes must still be a single byte wide.
    assert_equal(result.itemsize, 1)

    # One fresh label per original category, laid out twice.
    expected = LabelArray(
        larger_categories[:len(categories)] * 2,
        missing_value=None,
    )
    assert_equal(result.as_string_array(), expected.as_string_array())
def manual_narrow_condense_back_to_valid_size_slow(self):
"""This test is really slow so we don't want it run by default.
"""
+18 -13
View File
@@ -14,6 +14,23 @@ cdef inline double log2(double d):
return log(d) / log(2);
cpdef inline smallest_uint_that_can_hold(Py_ssize_t maxval):
    """Return the narrowest numpy unsigned int dtype for ``maxval``.

    NOTE(review): the callers visible in this change pass a *count* (the
    number of categories), not the largest code. Confirm the behaviour at
    exact powers of two before using this for a true maximum value.
    """
    if maxval < 1:
        # log2(x) tends to -infinity as x approaches 0, so anything below
        # 1 is floored at uint8.
        return np.uint8

    # log2(maxval) is the number of bits needed; dividing by 8 converts
    # bits to bytes, and ceil rounds a partial byte up. Integers only come
    # in widths of 1, 2, 4 or 8 bytes, so the byte count is used as an
    # index into ``_int_sizes`` to pick the next available width.
    size_index = int(np.ceil(log2(maxval) / 8))
    return unsigned_int_dtype_with_size_in_bytes(_int_sizes[size_index])
ctypedef fused unsigned_integral:
np.uint8_t
np.uint16_t
@@ -213,19 +230,7 @@ cpdef factorize_strings(np.ndarray[object] values,
raise ValueError('nvalues larger than uint64')
length = len(categories_array)
if length < 1:
# lim x -> 0 log2(x) == -infinity so we floor at uint8
narrowest_dtype = np.uint8
else:
# The number of bits required to hold the codes up to ``length`` is
# log2(length). The number of bits per bytes is 8. We cannot have
# fractional bytes so we need to round up. Finally, we can only have
# integers with widths 1, 2, 4, or 8 so so we need to round up to the
# next value by looking up the next largest size in ``_int_sizes``.
narrowest_dtype = unsigned_int_dtype_with_size_in_bytes(
_int_sizes[int(np.ceil(log2(length) / 8))]
)
narrowest_dtype = smallest_uint_that_can_hold(length)
if codes.dtype != narrowest_dtype:
# condense the codes down to the narrowest dtype possible
codes = codes.astype(narrowest_dtype)
+93 -1
View File
@@ -1,7 +1,7 @@
"""
An ndarray subclass for working with arrays of strings.
"""
from functools import partial
from functools import partial, total_ordering
from operator import eq, ne
import re
@@ -11,6 +11,7 @@ import pandas as pd
from toolz import compose
from zipline.utils.compat import unicode
from zipline.utils.functional import instance
from zipline.utils.preprocess import preprocess
from zipline.utils.sentinel import sentinel
from zipline.utils.input_validation import (
@@ -29,6 +30,7 @@ from zipline.utils.pandas_utils import ignore_pandas_nan_categorical_warning
from ._factorize import (
factorize_strings,
factorize_strings_known_categories,
smallest_uint_that_can_hold,
)
@@ -136,6 +138,7 @@ class LabelArray(ndarray):
http://docs.scipy.org/doc/numpy-1.10.0/user/basics.subclassing.html
"""
SUPPORTED_SCALAR_TYPES = (bytes, unicode, type(None))
SUPPORTED_NON_NONE_SCALAR_TYPES = (bytes, unicode)
@preprocess(
values=coerce(list, partial(np.asarray, dtype=object)),
@@ -565,6 +568,83 @@ class LabelArray(ndarray):
# locations in our indices.
return results[self.as_int_array()]
def map(self, f):
    """
    Map a function from str -> str element-wise over ``self``.

    ``f`` will be applied exactly once to each non-missing unique value in
    ``self``. Missing values will always map to ``self.missing_value``.

    Parameters
    ----------
    f : callable[str -> str or None]
        Function applied to each non-missing category. It may only return
        None when ``self.missing_value`` is None. Any callable is accepted,
        including ones without a ``__name__`` (e.g. ``functools.partial``).

    Returns
    -------
    mapped : LabelArray
        Array built by ``from_codes_and_metadata`` from the re-mapped codes
        and the de-duplicated new categories, with the same missing value
        as ``self``.

    Raises
    ------
    TypeError
        If ``f`` returns a value whose type is not allowed.
    """
    # f() should only return None if None is our missing value.
    if self.missing_value is None:
        allowed_outtypes = self.SUPPORTED_SCALAR_TYPES
    else:
        allowed_outtypes = self.SUPPORTED_NON_NONE_SCALAR_TYPES

    def f_to_use(x,
                 missing_value=self.missing_value,
                 otypes=allowed_outtypes):

        # Don't call f on the missing value; those locations don't exist
        # semantically. We return _sortable_sentinel rather than None
        # because the np.unique call below sorts the categories array,
        # which raises an error on Python 3 because None and str aren't
        # comparable.
        if x == missing_value:
            return _sortable_sentinel

        ret = f(x)

        if not isinstance(ret, otypes):
            raise TypeError(
                "LabelArray.map expected function {f} to return a string"
                " or None, but got {type} instead.\n"
                "Value was {value}.".format(
                    # Not every callable has a __name__ (functools.partial
                    # objects and many callable instances do not). Fall
                    # back to repr so that we raise the TypeError we
                    # promise rather than an AttributeError.
                    f=getattr(f, '__name__', repr(f)),
                    type=type(ret).__name__,
                    value=ret,
                )
            )

        # A result equal to the missing value marks the location as
        # missing in the output.
        if ret == missing_value:
            return _sortable_sentinel

        return ret

    new_categories_with_duplicates = (
        np.vectorize(f_to_use, otypes=[object])(self.categories)
    )

    # If f() maps multiple inputs to the same output, then we can end up
    # with the same code duplicated multiple times. Compress the categories
    # by running them through np.unique, and then use the reverse lookup
    # table to compress codes as well.
    new_categories, bloated_inverse_index = np.unique(
        new_categories_with_duplicates,
        return_inverse=True
    )

    if new_categories[0] is _sortable_sentinel:
        # f_to_use returns _sortable_sentinel for locations that should be
        # missing values in our output. Since np.unique returns the uniques
        # in sorted order, and since _sortable_sentinel sorts before any
        # string, we only need to check the first array entry.
        new_categories[0] = self.missing_value

    # `reverse_index` will always be a 64 bit integer even if we can hold a
    # smaller array.
    reverse_index = bloated_inverse_index.astype(
        smallest_uint_that_can_hold(len(new_categories))
    )
    new_codes = np.take(reverse_index, self.as_int_array())

    return self.from_codes_and_metadata(
        new_codes,
        new_categories,
        dict(zip(new_categories, range(len(new_categories)))),
        missing_value=self.missing_value,
    )
def startswith(self, prefix):
"""
Element-wise startswith.
@@ -650,3 +730,15 @@ class LabelArray(ndarray):
element of self was an element of ``container``.
"""
return self.map_predicate(container.__contains__)
@instance  # This makes _sortable_sentinel a singleton instance.
@total_ordering
class _sortable_sentinel(object):
    """Dummy object that sorts before any other python object.

    Equality is identity. The ordering is strict: the sentinel is less than
    every *other* object but not less than itself, so the comparisons that
    ``total_ordering`` derives from ``__lt__`` and ``__eq__`` stay
    consistent with each other.
    """
    def __eq__(self, other):
        return self is other

    # Defining __eq__ disables the inherited hash on Python 3. Equality is
    # identity here, so the default identity hash is the matching one.
    __hash__ = object.__hash__

    def __lt__(self, other):
        # A strict ordering must be irreflexive. Returning True
        # unconditionally would make ``s < s`` True while ``s == s`` is
        # also True, which sort routines are entitled to assume never
        # happens.
        return self is not other
+64 -2
View File
@@ -14,7 +14,7 @@ from zipline.pipeline.api_utils import restrict_to_dtype
from zipline.pipeline.sentinels import NotSpecified
from zipline.pipeline.term import ComputableTerm
from zipline.utils.compat import unicode
from zipline.utils.input_validation import expect_types
from zipline.utils.input_validation import expect_types, expect_dtypes
from zipline.utils.memoize import classlazyval
from zipline.utils.numpy_utils import (
categorical_dtype,
@@ -39,7 +39,7 @@ string_classifiers_only = restrict_to_dtype(
dtype=categorical_dtype,
message_template=(
"{method_name}() is only defined on Classifiers producing strings"
" but it was called on a Factor of dtype {received_dtype}."
" but it was called on a Classifier of dtype {received_dtype}."
)
)
@@ -224,6 +224,26 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm):
opargs=(pattern,),
)
# TODO: Support relabeling for integer dtypes.
@string_classifiers_only
def relabel(self, relabeler):
    """
    Build a new classifier whose labels are the result of applying
    ``relabeler`` to the labels produced by ``self``.

    Parameters
    ----------
    relabeler : function[str -> str or None]
        Function applied to each unique value produced by ``self``.

    Returns
    -------
    relabeled : Classifier
        A ``Relabel`` term that wraps ``self`` and applies ``relabeler``.
    """
    relabeled = Relabel(term=self, relabeler=relabeler)
    return relabeled
def element_of(self, choices):
"""
Construct a Filter indicating whether values are in ``choices``.
@@ -382,6 +402,48 @@ class Quantiles(SingleInputMixin, Classifier):
return type(self).__name__ + '(%d)' % self.params['bins']
class Relabel(SingleInputMixin, Classifier):
    """
    A classifier applying a relabeling function on the result of another
    classifier.

    Parameters
    ----------
    term : zipline.pipeline.Classifier
        Term producing the input to be relabeled. Must have the
        categorical (string) dtype.
    relabeler : function[str -> str or None]
        Function applied to the labels produced by ``term``. It is passed
        to ``LabelArray.map``, so it receives individual labels rather
        than a whole array.
    """
    window_length = 0
    params = ('relabeler',)

    # TODO: Support relabeling for integer dtypes.
    @expect_dtypes(term=categorical_dtype)
    @expect_types(term=Classifier)
    def __new__(cls, term, relabeler):
        return super(Relabel, cls).__new__(
            cls,
            inputs=(term,),
            dtype=term.dtype,
            mask=term.mask,
            relabeler=relabeler,
        )

    def _compute(self, arrays, dates, assets, mask):
        """
        Relabel the single input array and blank out masked locations.

        Raises
        ------
        NotImplementedError
            If the input is not a ``LabelArray``.
        """
        relabeler = self.params['relabeler']
        data = arrays[0]

        if not isinstance(data, LabelArray):
            raise NotImplementedError(
                "Relabeling is not currently supported for "
                "int-dtype classifiers."
            )

        result = data.map(relabeler)
        # Locations excluded by the mask are reported as missing.
        result[~mask] = data.missing_value
        return result
class CustomClassifier(PositiveWindowLengthMixin,
StandardOutputs,
CustomTermMixin,
+3 -14
View File
@@ -432,6 +432,7 @@ class ArrayPredicate(SingleInputMixin, Filter):
opargs : tuple[hashable]
Additional argument to apply to ``op``.
"""
params = ('op', 'opargs')
window_length = 0
@expect_types(term=Term, opargs=tuple)
@@ -445,22 +446,10 @@ class ArrayPredicate(SingleInputMixin, Filter):
mask=term.mask,
)
def _init(self, op, opargs, *args, **kwargs):
self._op = op
self._opargs = opargs
return super(ArrayPredicate, self)._init(*args, **kwargs)
@classmethod
def _static_identity(cls, op, opargs, *args, **kwargs):
return (
super(ArrayPredicate, cls)._static_identity(*args, **kwargs),
op,
opargs,
)
def _compute(self, arrays, dates, assets, mask):
    """
    Apply the stored predicate to the single input array.

    ``op`` and ``opargs`` are read from ``self.params``. The predicate's
    result is AND-ed with ``mask``.
    """
    # The earlier ``return self._op(data, *self._opargs) & mask`` is
    # dropped: it made the params-based return below unreachable and left
    # ``params`` unused.
    params = self.params
    data = arrays[0]
    return params['op'](data, *params['opargs']) & mask
class Latest(LatestMixin, CustomFilter):
+1
View File
@@ -229,6 +229,7 @@ class Pipeline(object):
raise AssertionError("Unknown graph format %r." % format)
@staticmethod
@expect_types(term=Term, column_name=str)
def validate_column(column_name, term):
if term.ndim == 1:
raise UnsupportedPipelineOutput(column_name=column_name, term=term)