mirror of
https://github.com/wassname/catalyst.git
synced 2026-06-30 04:17:29 +08:00
ENH: Add relabel method to string classifiers.
- Adds a `map` method to `LabelArray` that maps a unary function over the categories of a LabelArray, shrinking the underlying codes if possible. - Adds a new `.relabel` method to string-dtype classifiers that maps a unary function over the unique elements of the underlying LabelArray. This is useful for things like cleaning noisy label data.
This commit is contained in:
@@ -468,6 +468,70 @@ class ClassifierTestCase(BasePipelineTestCase):
|
||||
)
|
||||
self.assertEqual(errmsg, expected)
|
||||
|
||||
@parameter_space(
    __fail_fast=True,
    labelarray_dtype=(categorical_dtype, bytes_dtype, unicode_dtype),
    relabel_func=[
        lambda s: s[0],
        lambda s: str(len(s)),
        lambda s: str(len([c for c in s if c == 'a'])),
        lambda s: None,
    ]
)
def test_relabel_strings(self, relabel_func, labelarray_dtype):
    """
    ``Classifier.relabel`` should agree with applying ``relabel_func`` to
    every element of the raw label data.
    """

    class C(Classifier):
        inputs = ()
        dtype = categorical_dtype
        missing_value = None
        window_length = 0

    classifier = C()

    rows = [
        ['a', 'aa', 'aaa', 'abab'],
        ['bab', 'aba', 'aa', 'bb'],
        ['a', 'aba', 'abaa', 'abaab'],
        ['a', 'aa', 'aaa', 'aaaa'],
    ]
    raw = np.asarray(rows, dtype=labelarray_dtype)

    # Reference answer: apply the function directly to each raw element.
    apply_elementwise = np.vectorize(relabel_func, otypes=[object])
    raw_relabeled = apply_elementwise(raw)

    # The input classifier's output is injected through the workspace.
    data = LabelArray(raw, missing_value=None)

    terms = {'relabeled': classifier.relabel(relabel_func)}
    expected_results = {
        'relabeled': LabelArray(raw_relabeled, missing_value=None),
    }

    all_ones = self.ones_mask(shape=data.shape)
    self.check_terms(
        terms,
        expected_results,
        initial_workspace={classifier: data},
        mask=self.build_mask(all_ones),
    )
|
||||
|
||||
def test_relabel_int_classifier_not_yet_supported(self):
    """
    Calling ``relabel`` on an int64 classifier should raise a TypeError
    with a message naming the offending dtype.
    """

    class C(Classifier):
        inputs = ()
        dtype = int64_dtype
        missing_value = -1
        window_length = 0

    int_classifier = C()

    def never_called(x):
        # Would raise ZeroDivisionError; the relabeler should never run.
        return 0 / 0

    with self.assertRaises(TypeError) as raised:
        int_classifier.relabel(never_called)

    expected_message = (
        "relabel() is only defined on Classifiers producing strings "
        "but it was called on a Classifier of dtype int64."
    )
    self.assertEqual(str(raised.exception), expected_message)
|
||||
|
||||
|
||||
class TestPostProcessAndToWorkSpaceValue(ZiplineTestCase):
|
||||
def test_reversability_categorical(self):
|
||||
|
||||
@@ -109,6 +109,65 @@ class LabelArrayTestCase(ZiplineTestCase):
|
||||
np_contains(strs) & notmissing,
|
||||
)
|
||||
|
||||
@parameter_space(
    __fail_fast=True,
    f=[
        lambda s: str(len(s)),
        lambda s: s[0],
        lambda s: ''.join(reversed(s)),
        lambda s: '',
    ]
)
def test_map(self, f):
    """
    ``LabelArray.map(f)`` should match ``np.vectorize(f)`` applied to the
    same underlying strings.
    """
    rows = [
        ['E', 'GHIJ', 'HIJKLMNOP', 'DEFGHIJ'],
        ['CDE', 'ABCDEFGHIJKLMNOPQ', 'DEFGHIJKLMNOPQRS', 'ABCDEFGHIJK'],
        ['DEFGHIJKLMNOPQR', 'DEFGHI', 'DEFGHIJ', 'FGHIJK'],
        ['EFGHIJKLM', 'EFGHIJKLMNOPQRS', 'ABCDEFGHI', 'DEFGHIJ'],
    ]
    strings = np.array(rows, dtype=object)
    labels = LabelArray(strings, missing_value=None)

    expected = np.vectorize(f)(strings)
    actual = labels.map(f).as_string_array()

    assert_equal(expected, actual)
|
||||
|
||||
def test_map_ignores_missing_value(self):
    """
    The mapped function should be applied to 'B' and 'C' only; the missing
    value 'A' should come through unchanged.
    """
    def increment_char(c):
        return chr(ord(c) + 1)

    source = LabelArray(
        np.array(['A', 'B', 'C'], dtype=object),
        missing_value='A',
    )
    mapped = source.map(increment_char)

    # 'A' stays put while 'B' -> 'C' and 'C' -> 'D'.
    expected = LabelArray(['A', 'C', 'D'], missing_value='A')
    assert_equal(mapped.as_string_array(), expected.as_string_array())
|
||||
|
||||
@parameter_space(
    __fail_fast=True,
    f=[
        lambda s: 0,
        lambda s: 0.0,
        lambda s: object(),
    ]
)
def test_map_requires_f_to_return_a_string(self, f):
    """
    Mapping a function that returns an int, float, or arbitrary object
    should raise a TypeError.
    """
    labels = LabelArray(self.strs, missing_value=None)

    with self.assertRaises(TypeError):
        labels.map(f)
|
||||
|
||||
def test_map_can_only_return_none_if_missing_value_is_none(self):
    """
    Returning None from the mapped function is accepted when the array's
    missing value is None, and raises a TypeError otherwise.
    """
    def always_none(x):
        return None

    # None is the missing value here, so this should not raise.
    none_missing = LabelArray(self.strs, missing_value=None)
    none_missing.map(always_none)

    # With a string missing value, None is not a valid label.
    string_missing = LabelArray(self.strs, missing_value="__MISSING__")
    with self.assertRaises(TypeError):
        string_missing.map(always_none)
|
||||
|
||||
@parameter_space(
|
||||
__fail_fast=True,
|
||||
missing_value=('', 'a', 'not in the array', None),
|
||||
@@ -436,6 +495,73 @@ class LabelArrayTestCase(ZiplineTestCase):
|
||||
assert_equal(arr.itemsize, 2)
|
||||
self.check_roundtrip(arr)
|
||||
|
||||
def test_map_shrinks_code_storage_if_possible(self):
    """
    Mapping many categories down to two should shrink the code itemsize
    from 2 bytes to 1 byte.
    """
    # Drop the last value so we fit in a uint16 with None as a missing
    # value.
    many_categories = self.create_categories(16, plus_one=False)[:-1]
    arr = LabelArray(many_categories, missing_value=None)

    self.assertEqual(arr.itemsize, 2)

    def either_A_or_B(s):
        # Parity of the summed code points selects the label.
        return 'B' if sum(map(ord, s)) % 2 else 'A'

    result = arr.map(either_A_or_B)

    self.assertEqual(set(result.categories), {'A', 'B', None})
    self.assertEqual(result.itemsize, 1)

    expected_strings = np.vectorize(either_A_or_B)(arr.as_string_array())
    assert_equal(expected_strings, result.as_string_array())
|
||||
|
||||
def test_map_never_increases_code_storage_size(self):
    """
    Mapping an impure function that returns a fresh label on every call
    must not grow the code itemsize.
    """
    # Pathological case: a naive implementation that called the user's
    # function once per element could produce more categories than the
    # input had, forcing wider codes after the map.
    #
    # That cannot happen because the mapping function is called exactly
    # once per unique category, so the number of categories can never
    # grow.

    # Use all but one of the categories so that we still fit in a uint8
    # with an extra category for None as a missing value.
    categories = self.create_categories(8, plus_one=False)[:-1]
    larger_categories = self.create_categories(16, plus_one=False)

    # Repeat every category so that each label occurs twice in the input.
    categories_twice = categories + categories

    arr = LabelArray(categories_twice, missing_value=None)
    assert_equal(arr.itemsize, 1)

    fresh_labels = iter(larger_categories)

    def new_string_every_time(c):
        # Hand out the next unused label regardless of the input.
        return next(fresh_labels)

    result = arr.map(new_string_every_time)

    # Result should still be of size 1.
    assert_equal(result.itemsize, 1)

    # Result should be the first `len(categories)` entries from the larger
    # categories, repeated twice.
    expected_labels = larger_categories[:len(categories)] * 2
    expected = LabelArray(expected_labels, missing_value=None)
    assert_equal(result.as_string_array(), expected.as_string_array())
|
||||
|
||||
def manual_narrow_condense_back_to_valid_size_slow(self):
|
||||
"""This test is really slow so we don't want it run by default.
|
||||
"""
|
||||
|
||||
+18
-13
@@ -14,6 +14,23 @@ cdef inline double log2(double d):
|
||||
return log(d) / log(2);
|
||||
|
||||
|
||||
cpdef inline smallest_uint_that_can_hold(Py_ssize_t maxval):
    """Choose the smallest numpy unsigned int dtype sized for ``maxval``.

    The callers visible alongside this function pass a number of categories
    as ``maxval``. Values below 1 fall back to ``np.uint8``.
    """
    if maxval >= 1:
        # log2(maxval) is the number of bits needed; dividing by 8 and
        # rounding up converts that to whole bytes. Integer widths are
        # limited to 1, 2, 4, or 8 bytes, so the byte count is used as an
        # index into ``_int_sizes`` to find the next available width.
        return unsigned_int_dtype_with_size_in_bytes(
            _int_sizes[int(np.ceil(log2(maxval) / 8))]
        )

    # log2(x) tends to -infinity as x approaches 0, so floor at uint8.
    return np.uint8
|
||||
|
||||
|
||||
ctypedef fused unsigned_integral:
|
||||
np.uint8_t
|
||||
np.uint16_t
|
||||
@@ -213,19 +230,7 @@ cpdef factorize_strings(np.ndarray[object] values,
|
||||
raise ValueError('nvalues larger than uint64')
|
||||
|
||||
length = len(categories_array)
|
||||
if length < 1:
|
||||
# lim x -> 0 log2(x) == -infinity so we floor at uint8
|
||||
narrowest_dtype = np.uint8
|
||||
else:
|
||||
# The number of bits required to hold the codes up to ``length`` is
|
||||
# log2(length). The number of bits per bytes is 8. We cannot have
|
||||
# fractional bytes so we need to round up. Finally, we can only have
|
||||
# integers with widths 1, 2, 4, or 8 so so we need to round up to the
|
||||
# next value by looking up the next largest size in ``_int_sizes``.
|
||||
narrowest_dtype = unsigned_int_dtype_with_size_in_bytes(
|
||||
_int_sizes[int(np.ceil(log2(length) / 8))]
|
||||
)
|
||||
|
||||
narrowest_dtype = smallest_uint_that_can_hold(length)
|
||||
if codes.dtype != narrowest_dtype:
|
||||
# condense the codes down to the narrowest dtype possible
|
||||
codes = codes.astype(narrowest_dtype)
|
||||
|
||||
@@ -29,6 +29,7 @@ from zipline.utils.pandas_utils import ignore_pandas_nan_categorical_warning
|
||||
from ._factorize import (
|
||||
factorize_strings,
|
||||
factorize_strings_known_categories,
|
||||
smallest_uint_that_can_hold,
|
||||
)
|
||||
|
||||
|
||||
@@ -136,6 +137,7 @@ class LabelArray(ndarray):
|
||||
http://docs.scipy.org/doc/numpy-1.10.0/user/basics.subclassing.html
|
||||
"""
|
||||
SUPPORTED_SCALAR_TYPES = (bytes, unicode, type(None))
|
||||
SUPPORTED_NON_NONE_SCALAR_TYPES = (bytes, unicode)
|
||||
|
||||
@preprocess(
|
||||
values=coerce(list, partial(np.asarray, dtype=object)),
|
||||
@@ -565,6 +567,64 @@ class LabelArray(ndarray):
|
||||
# locations in our indices.
|
||||
return results[self.as_int_array()]
|
||||
|
||||
def map(self, f):
    """
    Apply a str -> str function to every label in ``self``.

    ``f`` is called once for each non-missing entry of ``self.categories``
    rather than once per array element. The missing value is never passed
    to ``f`` and always maps to itself.

    Parameters
    ----------
    f : callable
        Function taking a label and returning a new label. It may return
        None only when ``self.missing_value`` is None.

    Returns
    -------
    mapped : LabelArray
        New array built from the relabeled categories, with the same
        missing value as ``self``.

    Raises
    ------
    TypeError
        If ``f`` returns a value that is not an allowed label type.
    """
    missing = self.missing_value

    # None is only an acceptable output when None is our missing value.
    if missing is None:
        accepted_types = self.SUPPORTED_SCALAR_TYPES
    else:
        accepted_types = self.SUPPORTED_NON_NONE_SCALAR_TYPES

    def relabel_category(label):
        # The missing value passes through without calling f().
        if label == missing:
            return label

        mapped = f(label)
        if isinstance(mapped, accepted_types):
            return mapped

        raise TypeError(
            "Expected f() to return a string. Got %s." % (
                type(mapped).__name__
            )
        )

    relabel_all = np.vectorize(relabel_category, otypes=[object])
    mapped_categories = relabel_all(self.categories)

    # Several inputs may map to the same output, leaving duplicate
    # categories. np.unique removes the duplicates, and its inverse index
    # says which compressed category each old category landed on.
    # NOTE(review): np.unique sorts its input; confirm that None and string
    # labels can be ordered together on the supported Python/numpy versions.
    unique_categories, wide_inverse = np.unique(
        mapped_categories,
        return_inverse=True,
    )

    # The inverse index comes back as a 64 bit integer array even when a
    # narrower dtype would do, so narrow it before using it as codes.
    code_dtype = smallest_uint_that_can_hold(len(unique_categories))
    old_code_to_new_code = wide_inverse.astype(code_dtype)
    new_codes = old_code_to_new_code.take(self.as_int_array())

    reverse_categories = {
        category: code for code, category in enumerate(unique_categories)
    }
    return self.from_codes_and_metadata(
        new_codes,
        unique_categories,
        reverse_categories,
        missing_value=missing,
    )
|
||||
|
||||
def startswith(self, prefix):
|
||||
"""
|
||||
Element-wise startswith.
|
||||
|
||||
@@ -14,7 +14,7 @@ from zipline.pipeline.api_utils import restrict_to_dtype
|
||||
from zipline.pipeline.sentinels import NotSpecified
|
||||
from zipline.pipeline.term import ComputableTerm
|
||||
from zipline.utils.compat import unicode
|
||||
from zipline.utils.input_validation import expect_types
|
||||
from zipline.utils.input_validation import expect_types, expect_dtypes
|
||||
from zipline.utils.memoize import classlazyval
|
||||
from zipline.utils.numpy_utils import (
|
||||
categorical_dtype,
|
||||
@@ -39,7 +39,7 @@ string_classifiers_only = restrict_to_dtype(
|
||||
dtype=categorical_dtype,
|
||||
message_template=(
|
||||
"{method_name}() is only defined on Classifiers producing strings"
|
||||
" but it was called on a Factor of dtype {received_dtype}."
|
||||
" but it was called on a Classifier of dtype {received_dtype}."
|
||||
)
|
||||
)
|
||||
|
||||
@@ -224,6 +224,26 @@ class Classifier(RestrictedDTypeMixin, ComputableTerm):
|
||||
opargs=(pattern,),
|
||||
)
|
||||
|
||||
# TODO: Support relabeling for integer dtypes.
@string_classifiers_only
def relabel(self, relabeler):
    """
    Build a new classifier whose labels are ``relabeler`` applied to the
    labels produced by ``self``.

    Parameters
    ----------
    relabeler : function[str -> str or None]
        Function applied to each unique value produced by ``self``.

    Returns
    -------
    relabeled : Classifier
        A ``Relabel`` term wrapping ``self`` and ``relabeler``.
    """
    relabeled = Relabel(term=self, relabeler=relabeler)
    return relabeled
|
||||
|
||||
def element_of(self, choices):
|
||||
"""
|
||||
Construct a Filter indicating whether values are in ``choices``.
|
||||
@@ -382,6 +402,48 @@ class Quantiles(SingleInputMixin, Classifier):
|
||||
return type(self).__name__ + '(%d)' % self.params['bins']
|
||||
|
||||
|
||||
class Relabel(SingleInputMixin, Classifier):
    """
    A classifier that applies a relabeling function to the output of
    another classifier.

    Parameters
    ----------
    term : zipline.pipeline.Classifier
        Term producing the input to be relabeled.
    relabeler : function[str -> str or None]
        Function passed to ``LabelArray.map`` on the output of ``term``.
    """
    window_length = 0
    params = ('relabeler',)

    # TODO: Support relabeling for integer dtypes.
    @expect_dtypes(term=categorical_dtype)
    @expect_types(term=Classifier)
    def __new__(cls, term, relabeler):
        # The new term takes its dtype and mask from the wrapped term.
        parent = super(Relabel, cls)
        return parent.__new__(
            cls,
            inputs=(term,),
            dtype=term.dtype,
            mask=term.mask,
            relabeler=relabeler,
        )

    def _compute(self, arrays, dates, assets, mask):
        """
        Map the stored relabeler over the single input array and set
        masked-out locations to the input's missing value.
        """
        relabeler = self.params['relabeler']
        data = arrays[0]

        if not isinstance(data, LabelArray):
            raise TypeError(
                "Relabeling is not currently supported for "
                "int-dtype classifiers."
            )

        result = data.map(relabeler)
        # Locations excluded by the mask are reported as missing.
        result[~mask] = data.missing_value
        return result
|
||||
|
||||
|
||||
class CustomClassifier(PositiveWindowLengthMixin,
|
||||
StandardOutputs,
|
||||
CustomTermMixin,
|
||||
|
||||
Reference in New Issue
Block a user