ENH: Add relabel method to string classifiers.

- Adds a `map` method to `LabelArray` that maps a unary function over
  the categories of a LabelArray, shrinking the underlying codes if
  possible.

- Adds a new `.relabel` method to string-dtype classifiers that maps a
  unary function over the unique elements of the underlying LabelArray.
  This is useful for things like cleaning noisy label data.
This commit is contained in:
Scott Sanderson
2017-06-07 11:50:59 -04:00
parent 8b17b7d4b3
commit e995e6f2ed
5 changed files with 332 additions and 15 deletions
+64
View File
@@ -468,6 +468,70 @@ class ClassifierTestCase(BasePipelineTestCase):
)
self.assertEqual(errmsg, expected)
@parameter_space(
    __fail_fast=True,
    labelarray_dtype=(categorical_dtype, bytes_dtype, unicode_dtype),
    relabel_func=[
        lambda s: s[0],
        lambda s: str(len(s)),
        lambda s: str(len([c for c in s if c == 'a'])),
        lambda s: None,
    ]
)
def test_relabel_strings(self, relabel_func, labelarray_dtype):
    """Check ``relabel`` against an element-wise numpy application.

    The expected output is built by vectorizing ``relabel_func`` over the
    raw labels and wrapping the result in a LabelArray.
    """
    class C(Classifier):
        inputs = ()
        dtype = categorical_dtype
        missing_value = None
        window_length = 0

    classifier = C()

    labels = np.asarray(
        [['a', 'aa', 'aaa', 'abab'],
         ['bab', 'aba', 'aa', 'bb'],
         ['a', 'aba', 'abaa', 'abaab'],
         ['a', 'aa', 'aaa', 'aaaa']],
        dtype=labelarray_dtype,
    )
    # Reference result: apply the relabeling function to every element.
    expected_labels = np.vectorize(relabel_func, otypes=[object])(labels)
    workspace_data = LabelArray(labels, missing_value=None)

    self.check_terms(
        {'relabeled': classifier.relabel(relabel_func)},
        {'relabeled': LabelArray(expected_labels, missing_value=None)},
        initial_workspace={classifier: workspace_data},
        mask=self.build_mask(self.ones_mask(shape=workspace_data.shape)),
    )
def test_relabel_int_classifier_not_yet_supported(self):
    """``relabel`` should raise TypeError for an int64-dtype classifier."""
    class C(Classifier):
        inputs = ()
        dtype = int64_dtype
        missing_value = -1
        window_length = 0

    int_classifier = C()

    with self.assertRaises(TypeError) as raised:
        # The relabeling function should never be called.
        int_classifier.relabel(lambda x: 0 / 0)

    self.assertEqual(
        str(raised.exception),
        "relabel() is only defined on Classifiers producing strings "
        "but it was called on a Classifier of dtype int64.",
    )
class TestPostProcessAndToWorkSpaceValue(ZiplineTestCase):
def test_reversability_categorical(self):
+126
View File
@@ -109,6 +109,65 @@ class LabelArrayTestCase(ZiplineTestCase):
np_contains(strs) & notmissing,
)
@parameter_space(
    __fail_fast=True,
    f=[
        lambda s: str(len(s)),
        lambda s: s[0],
        lambda s: ''.join(reversed(s)),
        lambda s: '',
    ]
)
def test_map(self, f):
    """``LabelArray.map`` should agree with ``np.vectorize`` on raw data."""
    strings = np.array(
        [['E', 'GHIJ', 'HIJKLMNOP', 'DEFGHIJ'],
         ['CDE', 'ABCDEFGHIJKLMNOPQ', 'DEFGHIJKLMNOPQRS', 'ABCDEFGHIJK'],
         ['DEFGHIJKLMNOPQR', 'DEFGHI', 'DEFGHIJ', 'FGHIJK'],
         ['EFGHIJKLM', 'EFGHIJKLMNOPQRS', 'ABCDEFGHI', 'DEFGHIJ']],
        dtype=object,
    )
    labels = LabelArray(strings, missing_value=None)

    # Reference: apply ``f`` directly to every element of the raw array.
    mapped_by_numpy = np.vectorize(f)(strings)
    mapped_by_labelarray = labels.map(f).as_string_array()

    assert_equal(mapped_by_numpy, mapped_by_labelarray)
def test_map_ignores_missing_value(self):
    """Entries equal to the missing value are left unchanged by ``map``."""
    def next_char(c):
        return chr(ord(c) + 1)

    labels = LabelArray(
        np.array(['A', 'B', 'C'], dtype=object),
        missing_value='A',
    )
    mapped = labels.map(next_char)

    # 'A' is the missing value, so only 'B' and 'C' are incremented.
    expected = LabelArray(['A', 'C', 'D'], missing_value='A')
    assert_equal(mapped.as_string_array(), expected.as_string_array())
@parameter_space(
    __fail_fast=True,
    f=[
        lambda s: 0,
        lambda s: 0.0,
        lambda s: object(),
    ]
)
def test_map_requires_f_to_return_a_string(self, f):
    """Mapping a function that returns a non-string raises TypeError."""
    labels = LabelArray(self.strs, missing_value=None)
    self.assertRaises(TypeError, labels.map, f)
def test_map_can_only_return_none_if_missing_value_is_none(self):
    """Returning None from the mapped function needs a None missing value."""
    def always_none(x):
        return None

    # Should work.
    LabelArray(self.strs, missing_value=None).map(always_none)

    # With a non-None missing value, a None result is rejected.
    with_sentinel = LabelArray(self.strs, missing_value="__MISSING__")
    with self.assertRaises(TypeError):
        with_sentinel.map(always_none)
@parameter_space(
__fail_fast=True,
missing_value=('', 'a', 'not in the array', None),
@@ -436,6 +495,73 @@ class LabelArrayTestCase(ZiplineTestCase):
assert_equal(arr.itemsize, 2)
self.check_roundtrip(arr)
def test_map_shrinks_code_storage_if_possible(self):
    """Mapping onto fewer categories should narrow the code storage."""
    # Drop the last value so we fit in a uint16 with None as a missing
    # value.
    categories = self.create_categories(16, plus_one=False)[:-1]
    arr = LabelArray(categories, missing_value=None)
    self.assertEqual(arr.itemsize, 2)

    def parity_label(s):
        # 'B' when the sum of the character ordinals is odd, else 'A'.
        return 'B' if sum(ord(c) for c in s) % 2 else 'A'

    shrunk = arr.map(parity_label)

    self.assertEqual(set(shrunk.categories), {'A', 'B', None})
    self.assertEqual(shrunk.itemsize, 1)
    assert_equal(
        np.vectorize(parity_label)(arr.as_string_array()),
        shrunk.as_string_array(),
    )
def test_map_never_increases_code_storage_size(self):
# This tests a pathological case where a user maps an impure function
# that returns a different label on every invocation, which in a naive
# implementation could cause us to need to **increase** the size of our
# codes after a map.
#
# This doesn't happen, however, because we guarantee that the user's
# mapping function will be called on each unique category exactly once,
# which means we can never increase the number of categories in the
# LabelArray after mapping.
# Using all but one of the categories so that we still fit in a uint8
# with an extra category for None as a missing value.
categories = self.create_categories(8, plus_one=False)[:-1]
larger_categories = self.create_categories(16, plus_one=False)
# Double the length of the categories so that we have to increase the
# required size after our map.
categories_twice = categories + categories
arr = LabelArray(categories_twice, missing_value=None)
assert_equal(arr.itemsize, 1)
# A single shared iterator: every call to the mapping function below
# consumes the next label from ``larger_categories``.
gen_unique_categories = iter(larger_categories)
def new_string_every_time(c):
# Return a new unique category every time so that every result is
# different.
return next(gen_unique_categories)
# The argument ``c`` is ignored by the function; the output depends only on
# how many times it has been called so far.
result = arr.map(new_string_every_time)
# Result should still be of size 1.
assert_equal(result.itemsize, 1)
# Result should be the first `len(categories)` entries from the larger
# categories, repeated twice.
expected = LabelArray(
larger_categories[:len(categories)] * 2,
missing_value=None,
)
assert_equal(result.as_string_array(), expected.as_string_array())
def manual_narrow_condense_back_to_valid_size_slow(self):
"""This test is really slow so we don't want it run by default.
"""
+18 -13
View File
@@ -14,6 +14,23 @@ cdef inline double log2(double d):
return log(d) / log(2);
cpdef inline smallest_uint_that_can_hold(Py_ssize_t maxval):
    """Choose the smallest numpy unsigned int dtype that can hold ``maxval``.

    Values of ``maxval`` below 1 map to ``np.uint8``.
    """
    if maxval < 1:
        # lim x -> 0 log2(x) == -infinity so we floor at uint8
        return np.uint8

    # The number of bits required to hold values up to ``maxval`` is
    # log2(maxval). There are 8 bits per byte, and we cannot have fractional
    # bytes, so we round up to a whole number of bytes.
    bytes_needed = int(np.ceil(log2(maxval) / 8))

    # Integers only come in widths of 1, 2, 4, or 8 bytes, so round up to the
    # next available width by looking it up in ``_int_sizes``.
    return unsigned_int_dtype_with_size_in_bytes(_int_sizes[bytes_needed])
ctypedef fused unsigned_integral:
np.uint8_t
np.uint16_t
@@ -213,19 +230,7 @@ cpdef factorize_strings(np.ndarray[object] values,
raise ValueError('nvalues larger than uint64')
length = len(categories_array)
if length < 1:
# lim x -> 0 log2(x) == -infinity so we floor at uint8
narrowest_dtype = np.uint8
else:
# The number of bits required to hold the codes up to ``length`` is
# log2(length). The number of bits per bytes is 8. We cannot have
# fractional bytes so we need to round up. Finally, we can only have
# integers with widths 1, 2, 4, or 8 so so we need to round up to the
# next value by looking up the next largest size in ``_int_sizes``.
narrowest_dtype = unsigned_int_dtype_with_size_in_bytes(
_int_sizes[int(np.ceil(log2(length) / 8))]
)
narrowest_dtype = smallest_uint_that_can_hold(length)
if codes.dtype != narrowest_dtype:
# condense the codes down to the narrowest dtype possible
codes = codes.astype(narrowest_dtype)
+60
View File
@@ -29,6 +29,7 @@ from zipline.utils.pandas_utils import ignore_pandas_nan_categorical_warning
from ._factorize import (
factorize_strings,
factorize_strings_known_categories,
smallest_uint_that_can_hold,
)
@@ -136,6 +137,7 @@ class LabelArray(ndarray):
http://docs.scipy.org/doc/numpy-1.10.0/user/basics.subclassing.html
"""
SUPPORTED_SCALAR_TYPES = (bytes, unicode, type(None))
SUPPORTED_NON_NONE_SCALAR_TYPES = (bytes, unicode)
@preprocess(
values=coerce(list, partial(np.asarray, dtype=object)),
@@ -565,6 +567,64 @@ class LabelArray(ndarray):
# locations in our indices.
return results[self.as_int_array()]
def map(self, f):
    """
    Apply ``f``, a function from str -> str, element-wise over ``self``.

    ``f`` is called exactly once for each non-missing unique value in
    ``self``. It is never called on ``self.missing_value``; missing values
    always map to ``self.missing_value``.

    Parameters
    ----------
    f : callable
        Function applied to each non-missing category. It must return a
        string (bytes or unicode). It may return None only when
        ``self.missing_value`` is None.

    Returns
    -------
    mapped
        The result of ``self.from_codes_and_metadata`` on the remapped
        codes and the de-duplicated categories, with the same
        ``missing_value`` as ``self``.

    Raises
    ------
    TypeError
        If ``f`` returns a value whose type is not allowed.
    """
    missing = self.missing_value

    # f() should only return None if None is our missing value.
    valid_types = (
        self.SUPPORTED_SCALAR_TYPES
        if missing is None
        else self.SUPPORTED_NON_NONE_SCALAR_TYPES
    )

    def apply_to_category(category):
        # Pass the missing value through without calling f() on it.
        if category == missing:
            return category

        mapped = f(category)
        if isinstance(mapped, valid_types):
            return mapped

        raise TypeError(
            "Expected f() to return a string. Got %s." % (
                type(mapped).__name__
            )
        )

    vectorized = np.vectorize(apply_to_category, otypes=[object])
    mapped_categories = vectorized(self.categories)

    # If f() sends several inputs to the same output, the mapped categories
    # contain duplicates. np.unique collapses them, and its inverse index
    # tells us, for each old code, which new code it became.
    #
    # NOTE(review): np.unique sorts its input. On Python 3, sorting an
    # object array that mixes None and str raises TypeError -- confirm
    # whether that can happen here when ``missing_value`` is None.
    unique_categories, wide_inverse = np.unique(
        mapped_categories,
        return_inverse=True,
    )

    # The inverse index is always a 64 bit integer even if a smaller dtype
    # can hold it, so narrow it before building the new codes.
    narrow_dtype = smallest_uint_that_can_hold(len(unique_categories))
    old_code_to_new_code = wide_inverse.astype(narrow_dtype)
    remapped_codes = old_code_to_new_code.take(self.as_int_array())

    category_to_code = {
        category: code for code, category in enumerate(unique_categories)
    }
    return self.from_codes_and_metadata(
        remapped_codes,
        unique_categories,
        category_to_code,
        missing_value=self.missing_value,
    )
def startswith(self, prefix):
"""
Element-wise startswith.
+64 -2
View File
@@ -14,7 +14,7 @@ from zipline.pipeline.api_utils import restrict_to_dtype
from zipline.pipeline.sentinels import NotSpecified
from zipline.pipeline.term import ComputableTerm
from zipline.utils.compat import unicode
from zipline.utils.input_validation import expect_types
from zipline.utils.input_validation import expect_types, expect_dtypes
from zipline.utils.memoize import classlazyval
from zipline.utils.numpy_utils import (
categorical_dtype,
@@ -39,7 +39,7 @@ string_classifiers_only = restrict_to_dtype(
dtype=categorical_dtype,
message_template=(
"{method_name}() is only defined on Classifiers producing strings"
" but it was called on a Factor of dtype {received_dtype}."
" but it was called on a Classifier of dtype {received_dtype}."
)
)
@@ -224,6 +224,26 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm):
opargs=(pattern,),
)
# TODO: Support relabeling for integer dtypes.
@string_classifiers_only
def relabel(self, relabeler):
    """
    Build a new classifier by applying ``relabeler`` to the labels
    produced by ``self``.

    Parameters
    ----------
    relabeler : function[str -> str or None]
        Function applied to each unique value produced by ``self``.

    Returns
    -------
    relabeled : Classifier
        A ``Relabel`` term taking ``self`` as its input and
        ``relabeler`` as its relabeling function.
    """
    relabeled = Relabel(term=self, relabeler=relabeler)
    return relabeled
def element_of(self, choices):
"""
Construct a Filter indicating whether values are in ``choices``.
@@ -382,6 +402,48 @@ class Quantiles(SingleInputMixin, Classifier):
return type(self).__name__ + '(%d)' % self.params['bins']
class Relabel(SingleInputMixin, Classifier):
    """
    A classifier that applies a relabeling function to the output of
    another classifier.

    Parameters
    ----------
    term : zipline.pipeline.Classifier
        Term producing the input to be relabeled. Must have the categorical
        (string) dtype.
    relabeler : function[str -> str or None]
        Function passed to ``LabelArray.map`` on the output of ``term``.
    """
    params = ('relabeler',)
    window_length = 0

    # TODO: Support relabeling for integer dtypes.
    @expect_dtypes(term=categorical_dtype)
    @expect_types(term=Classifier)
    def __new__(cls, term, relabeler):
        # The new term has ``term`` as its only input and inherits its dtype
        # and mask.
        return super(Relabel, cls).__new__(
            cls,
            inputs=(term,),
            dtype=term.dtype,
            mask=term.mask,
            relabeler=relabeler,
        )

    def _compute(self, arrays, dates, assets, mask):
        """Map the relabeler over the single input array.

        Raises TypeError if the input is not a LabelArray.
        """
        relabeler = self.params['relabeler']
        data = arrays[0]

        if not isinstance(data, LabelArray):
            raise TypeError(
                "Relabeling is not currently supported for "
                "int-dtype classifiers."
            )

        relabeled = data.map(relabeler)
        # Locations excluded by the mask are reset to the missing value.
        relabeled[~mask] = data.missing_value
        return relabeled
class CustomClassifier(PositiveWindowLengthMixin,
StandardOutputs,
CustomTermMixin,