diff --git a/tests/test_labelarray.py b/tests/test_labelarray.py
index 64004fe9..c08a58e4 100644
--- a/tests/test_labelarray.py
+++ b/tests/test_labelarray.py
@@ -1,10 +1,13 @@
 from itertools import product
 from operator import eq, ne
-import numpy as np
 import warnings
 
+import numpy as np
+from toolz import take
+
 from zipline.lib.labelarray import LabelArray
 from zipline.testing import check_arrays, parameter_space, ZiplineTestCase
+from zipline.testing.predicates import assert_equal
 from zipline.utils.compat import unicode
 
 
@@ -337,3 +340,111 @@ class LabelArrayTestCase(ZiplineTestCase):
         # Write the whole array.
         arr[:] = orig_arr
         check_arrays(arr, orig_arr)
+
+    def test_narrow_code_storage(self):
+        def check_roundtrip(arr):
+            assert_equal(
+                arr.as_string_array(),
+                LabelArray(
+                    arr.as_string_array(),
+                    arr.missing_value,
+                ).as_string_array(),
+            )
+
+        def create_categories(width, plus_one):
+            length = int(width / 8) + plus_one
+            return [
+                ''.join(cs)
+                for cs in take(
+                    2 ** width + plus_one,
+                    product([chr(c) for c in range(256)], repeat=length),
+                )
+            ]
+
+        # uint8
+        categories = create_categories(8, plus_one=False)
+        arr = LabelArray(
+            [],
+            missing_value=categories[0],
+            categories=categories,
+        )
+        self.assertEqual(arr.itemsize, 1)
+        check_roundtrip(arr)
+
+        # uint8 inference
+        arr = LabelArray(categories, missing_value=categories[0])
+        self.assertEqual(arr.itemsize, 1)
+        check_roundtrip(arr)
+
+        # just over uint8
+        categories = create_categories(8, plus_one=True)
+        arr = LabelArray(
+            [],
+            missing_value=categories[0],
+            categories=categories,
+        )
+        self.assertEqual(arr.itemsize, 2)
+        check_roundtrip(arr)
+
+        # uint16 inference
+        arr = LabelArray(categories, missing_value=categories[0])
+        self.assertEqual(arr.itemsize, 2)
+        check_roundtrip(arr)
+
+        # fits in uint16
+        categories = create_categories(16, plus_one=False)
+        arr = LabelArray(
+            [], missing_value=categories[0],
+            categories=categories,
+        )
+        self.assertEqual(arr.itemsize, 2)
+        check_roundtrip(arr)
+
+        # uint16 inference
+        arr = LabelArray(categories, missing_value=categories[0])
+        self.assertEqual(arr.itemsize, 2)
+        check_roundtrip(arr)
+
+        # just over uint16
+        categories = create_categories(16, plus_one=True)
+        arr = LabelArray(
+            [],
+            missing_value=categories[0],
+            categories=categories,
+        )
+        self.assertEqual(arr.itemsize, 4)
+        check_roundtrip(arr)
+
+        # uint32 inference
+        arr = LabelArray(categories, missing_value=categories[0])
+        self.assertEqual(arr.itemsize, 4)
+        check_roundtrip(arr)
+
+        # NOTE: we could do this for 32 and 64; however, no one has enough RAM
+        # or time for that.
+
+    def test_narrow_code_storage_edge_cases(self):
+        def check_values(values, itemsize, **kwargs):
+            arr = LabelArray(values, **kwargs)
+            self.assertEqual(arr.itemsize, itemsize)
+            self.assertEqual(arr.as_string_array().tolist(), values)
+
+        labels = ['%03d' % i for i in range(2 ** 8)]
+
+        # many values but few distinct labels: codes narrow to one byte
+        check_values(labels[:10] * 50, 1, missing_value=labels[0])
+
+        # every value is the missing value: a single category
+        check_values(labels[:1] * 500, 1, missing_value=labels[0])
+
+        # 2 ** 8 distinct labels plus a separate missing value need 257
+        # codes, which do not fit in one byte
+        check_values(labels, 2, missing_value='')
+
+        # the same holds when the categories are passed explicitly
+        check_values(
+            labels,
+            2,
+            missing_value='',
+            categories=list(labels),
+        )
diff --git a/zipline/lib/_factorize.pyx b/zipline/lib/_factorize.pyx
index 31cd0ff2..a90cec9f 100644
--- a/zipline/lib/_factorize.pyx
+++ b/zipline/lib/_factorize.pyx
@@ -1,53 +1,163 @@
 """
 Factorization algorithms.
 """
-from numpy cimport ndarray, int64_t, PyArray_Check, import_array
-from numpy import arange, asarray, empty, int64, isnan, ndarray, zeros
+from libc.math cimport floor, log
+cimport numpy as np
+import numpy as np
 
-import_array()
+from zipline.utils.numpy_utils import unsigned_int_dtype_with_size_in_bytes
+
+np.import_array()
 
 
-cpdef factorize_strings_known_categories(ndarray[object] values,
-                                         list categories,
-                                         object missing_value,
-                                         int sort):
-    """
-    Factorize an array whose categories are already known.
+cdef inline double log2(double d):
+    return log(d) / log(2);
 
-    Any entries not in the specified categories will be given the code for
-    `missing_value`.
-    """
+
+ctypedef fused unsigned_integral:
+    np.uint8_t
+    np.uint16_t
+    np.uint32_t
+    np.uint64_t
+
+
+cdef factorize_strings_known_impl(np.ndarray[object] values,
+                                  Py_ssize_t nvalues,
+                                  list categories,
+                                  object missing_value,
+                                  bint sort,
+                                  np.ndarray[unsigned_integral] codes):
+    """
+    Write the code of each entry of ``values`` into ``codes``; entries that
+    are not in ``categories`` get the code of ``missing_value``.
+    """
     if missing_value not in categories:
         categories.insert(0, missing_value)
 
     if sort:
         categories = sorted(categories)
 
-    cdef:
-        Py_ssize_t nvalues = len(values)
-        dict reverse_categories = dict(
-            zip(categories, range(len(categories)))
-        )
-
-    if not nvalues:
-        return (
-            asarray([], dtype=int64),
-            asarray(categories, dtype=object),
-            reverse_categories,
-        )
-
-    cdef:
-        Py_ssize_t i
-        Py_ssize_t missing_code = reverse_categories[missing_value]
-        ndarray[int64_t] codes = empty(nvalues, dtype=int64)
+    cdef dict reverse_categories = dict(
+        zip(categories, range(len(categories)))
+    )
+    cdef Py_ssize_t i
+    cdef Py_ssize_t missing_code = reverse_categories[missing_value]
 
     for i in range(nvalues):
         codes[i] = reverse_categories.get(values[i], missing_code)
 
-    return codes, asarray(categories, dtype=object), reverse_categories
+    return codes, np.asarray(categories, dtype=object), reverse_categories
 
 
-cpdef factorize_strings(ndarray[object] values,
+cpdef factorize_strings_known_categories(np.ndarray[object] values,
+                                         list categories,
+                                         object missing_value,
+                                         bint sort):
+    """
+    Factorize an array whose categories are already known.
+
+    Any entries not in the specified categories will be given the code for
+    `missing_value`.
+    """
+    cdef Py_ssize_t ncategories = len(categories)
+    cdef Py_ssize_t nvalues = len(values)
+    if missing_value not in categories:
+        # ``missing_value`` gets inserted by the implementation, so it needs a
+        # code of its own
+        ncategories += 1
+
+    if ncategories <= 2 ** 8:
+        return factorize_strings_known_impl[np.uint8_t](
+            values,
+            nvalues,
+            categories,
+            missing_value,
+            sort,
+            np.empty(nvalues, dtype=np.uint8)
+        )
+    elif ncategories <= 2 ** 16:
+        return factorize_strings_known_impl[np.uint16_t](
+            values,
+            nvalues,
+            categories,
+            missing_value,
+            sort,
+            np.empty(nvalues, np.uint16),
+        )
+    elif ncategories <= 2 ** 32:
+        return factorize_strings_known_impl[np.uint32_t](
+            values,
+            nvalues,
+            categories,
+            missing_value,
+            sort,
+            np.empty(nvalues, np.uint32),
+        )
+    elif ncategories <= 2 ** 64:
+        return factorize_strings_known_impl[np.uint64_t](
+            values,
+            nvalues,
+            categories,
+            missing_value,
+            sort,
+            np.empty(nvalues, np.uint64),
+        )
+    else:
+        raise ValueError('ncategories larger than uint64')
+
+
+cdef factorize_strings_impl(np.ndarray[object] values,
+                            Py_ssize_t nvalues,
+                            object missing_value,
+                            bint sort,
+                            np.ndarray[unsigned_integral] codes):
+    """
+    Assign codes in order of first appearance (``missing_value`` is always
+    code 0) and, if ``sort`` is set, remap them so the rest are sorted.
+    """
+    cdef list categories = [missing_value]
+    cdef dict reverse_categories = {missing_value: 0}
+
+    cdef Py_ssize_t i, code
+    cdef object key = None
+
+    for i in range(nvalues):
+        key = values[i]
+        code = reverse_categories.get(key, -1)
+        if code == -1:
+            # Assign new code.
+            code = len(reverse_categories)
+            reverse_categories[key] = code
+            categories.append(key)
+        codes[i] = code
+
+    cdef np.ndarray[np.int64_t, ndim=1] sorter
+    cdef np.ndarray[unsigned_integral, ndim=1] reverse_indexer
+    cdef int ncategories
+    cdef np.ndarray[object] categories_array = np.asarray(
+        categories,
+        dtype=object,
+    )
+
+    if sort:
+        # This is all adapted from pandas.core.algorithms.factorize.
+        ncategories = len(categories_array)
+        sorter = np.zeros(ncategories, dtype=np.int64)
+
+        # Don't include missing_value in the argsort, because None is
+        # unorderable with bytes/str in py3. Always just sort it to 0.
+        sorter[1:] = categories_array[1:].argsort() + 1
+        reverse_indexer = np.empty(ncategories, dtype=codes.dtype)
+        reverse_indexer.put(sorter, np.arange(ncategories))
+
+        codes = reverse_indexer.take(codes)
+        categories_array = categories_array.take(sorter)
+        reverse_categories = dict(zip(categories_array, range(ncategories)))
+
+    return codes, categories_array, reverse_categories
+
+
+cpdef factorize_strings(np.ndarray[object] values,
                         object missing_value,
                         int sort):
     """
@@ -63,52 +173,72 @@ cpdef factorize_strings(ndarray[object] values,
     ObjectVector. Python's builtin data structures are **really**
     well-optimized.)
     """
-    cdef:
-        Py_ssize_t nvalues = len(values)
-        list categories = [missing_value]
-        dict reverse_categories = {missing_value: 0}
+    cdef Py_ssize_t nvalues = len(values)
+    cdef np.ndarray codes
+    cdef np.ndarray categories_array
+    cdef dict reverse_categories
 
-    # Short circuit on empty array.
-    if not nvalues:
-        return (
-            asarray([], dtype=int64),
-            asarray(categories, dtype=object),
-            reverse_categories,
+    # ``missing_value`` always owns code 0, so ``nvalues`` distinct values may
+    # need ``nvalues + 1`` codes; hence the strict comparisons below.
+    if nvalues < 2 ** 8:
+        # we won't try to shrink because the ``codes`` array cannot get any
+        # smaller
+        return factorize_strings_impl[np.uint8_t](
+            values,
+            nvalues,
+            missing_value,
+            sort,
+            np.empty(nvalues, dtype=np.uint8)
         )
+    elif nvalues < 2 ** 16:
+        (codes,
+         categories_array,
+         reverse_categories) = factorize_strings_impl[np.uint16_t](
+            values,
+            nvalues,
+            missing_value,
+            sort,
+            np.empty(nvalues, np.uint16),
+        )
+    elif nvalues < 2 ** 32:
+        (codes,
+         categories_array,
+         reverse_categories) = factorize_strings_impl[np.uint32_t](
+            values,
+            nvalues,
+            missing_value,
+            sort,
+            np.empty(nvalues, np.uint32),
+        )
+    elif nvalues < 2 ** 64:
+        (codes,
+         categories_array,
+         reverse_categories) = factorize_strings_impl[np.uint64_t](
+            values,
+            nvalues,
+            missing_value,
+            sort,
+            np.empty(nvalues, np.uint64),
+        )
+    else:
+        # unreachable
+        raise ValueError('nvalues larger than uint64')
 
-    cdef:
-        Py_ssize_t i, code
-        object key = None
-        ndarray[int64_t] codes = empty(nvalues, dtype=int64)
-
-    for i in range(nvalues):
-        key = values[i]
-        code = reverse_categories.get(key, -1)
-        if code == -1:
-            # Assign new code.
-            code = len(reverse_categories)
-            reverse_categories[key] = code
-            categories.append(key)
-        codes[i] = code
-
-    cdef ndarray[int64_t, ndim=1] sorter
-    cdef ndarray[int64_t, ndim=1] reverse_indexer
-    cdef int ncategories
-    cdef ndarray[object] categories_array = asarray(categories, dtype=object)
-
-    if sort:
-        # This is all adapted from pandas.core.algorithms.factorize.
-        ncategories = len(categories_array)
-        sorter = zeros(ncategories, dtype=int64)
-
-        # Don't include missing_value in the argsort, because None is
-        # unorderable with bytes/str in py3. Always just sort it to 0.
-        sorter[1:] = categories_array[1:].argsort() + 1
-        reverse_indexer = empty(ncategories, dtype=int64)
-        reverse_indexer.put(sorter, arange(ncategories))
-
-        codes = reverse_indexer.take(codes)
-        categories_array = categories_array.take(sorter)
-        reverse_categories = dict(zip(categories_array, range(ncategories)))
+    # ``itemsize`` is measured in bytes, so pick the narrowest unsigned width
+    # (in bytes) whose range covers every code in [0, ncategories).
+    ncategories = len(categories_array)
+    if ncategories <= 2 ** 8:
+        width = 1
+    elif ncategories <= 2 ** 16:
+        width = 2
+    elif ncategories <= 2 ** 32:
+        width = 4
+    else:
+        width = 8
+
+    if width < codes.dtype.itemsize:
+        # if there are a lot of duplicates in the values we may need to shrink
+        # the width of the ``codes`` array
+        codes = codes.astype(unsigned_int_dtype_with_size_in_bytes(width))
 
     return codes, categories_array, reverse_categories
diff --git a/zipline/lib/labelarray.py b/zipline/lib/labelarray.py
index 1a469b43..c9db21e6 100644
--- a/zipline/lib/labelarray.py
+++ b/zipline/lib/labelarray.py
@@ -21,7 +21,7 @@ from zipline.utils.input_validation import (
 )
 from zipline.utils.numpy_utils import (
     bool_dtype,
-    int_dtype_with_size_in_bytes,
+    unsigned_int_dtype_with_size_in_bytes,
     is_object,
 )
 from zipline.utils.pandas_utils import ignore_pandas_nan_categorical_warning
@@ -81,6 +81,7 @@ class CategoryMismatch(ValueError):
             )
         )
 
+
 _NotPassed = sentinel('_NotPassed')
 
 
@@ -175,7 +176,7 @@ class LabelArray(ndarray):
             )
         categories.setflags(write=False)
 
-        return cls._from_codes_and_metadata(
+        return cls.from_codes_and_metadata(
             codes=codes.reshape(values.shape),
             categories=categories,
             reverse_categories=reverse_categories,
@@ -183,13 +184,24 @@ class LabelArray(ndarray):
         )
 
     @classmethod
-    def _from_codes_and_metadata(cls,
-                                 codes,
-                                 categories,
-                                 reverse_categories,
-                                 missing_value):
+    def from_codes_and_metadata(cls,
+                                codes,
+                                categories,
+                                reverse_categories,
+                                missing_value):
         """
-        View codes as a LabelArray and set LabelArray metadata on the result.
+        Rehydrate a LabelArray from the codes and metadata.
+
+        Parameters
+        ----------
+        codes : np.ndarray[integral]
+            The codes for the label array.
+        categories : np.ndarray[object]
+            The unique string categories.
+        reverse_categories : dict[str, int]
+            The mapping from category to its code-index.
+        missing_value : any
+            The value used to represent missing data.
         """
         ret = codes.view(type=cls, dtype=np.void)
         ret._categories = categories
@@ -289,7 +301,7 @@ class LabelArray(ndarray):
         """
         return self.view(
             type=ndarray,
-            dtype=int_dtype_with_size_in_bytes(self.itemsize),
+            dtype=unsigned_int_dtype_with_size_in_bytes(self.itemsize),
         )
 
     def as_string_array(self):
@@ -384,7 +396,9 @@ class LabelArray(ndarray):
 
         # Result is a scalar value, which will be an instance of np.void.
         # Map it back to one of our category entries.
-        index = result.view(int_dtype_with_size_in_bytes(self.itemsize))
+        index = result.view(
+            unsigned_int_dtype_with_size_in_bytes(self.itemsize),
+        )
         return self.categories[index]
 
     def is_missing(self):
@@ -514,11 +528,11 @@ class LabelArray(ndarray):
         Make an empty LabelArray with the same categories as ``self``, filled
         with ``self.missing_value``.
         """
-        return type(self)._from_codes_and_metadata(
+        return type(self).from_codes_and_metadata(
             codes=np.full(
                 shape,
                 self.reverse_categories[self.missing_value],
-                dtype=int_dtype_with_size_in_bytes(self.itemsize),
+                dtype=unsigned_int_dtype_with_size_in_bytes(self.itemsize),
             ),
             categories=self.categories,
             reverse_categories=self.reverse_categories,
diff --git a/zipline/utils/numpy_utils.py b/zipline/utils/numpy_utils.py
index 0a13b08f..9011df6c 100644
--- a/zipline/utils/numpy_utils.py
+++ b/zipline/utils/numpy_utils.py
@@ -87,6 +87,13 @@ INT_DTYPES_BY_SIZE_BYTES = OrderedDict([
     (8, dtype('int64')),
 ])
 
+UNSIGNED_INT_DTYPES_BY_SIZE_BYTES = OrderedDict([
+    (1, dtype('uint8')),
+    (2, dtype('uint16')),
+    (4, dtype('uint32')),
+    (8, dtype('uint64')),
+])
+
 
 def int_dtype_with_size_in_bytes(size):
     try:
@@ -95,6 +102,15 @@ def int_dtype_with_size_in_bytes(size):
         raise ValueError("No integral dtype whose size is %d bytes." % size)
 
 
+def unsigned_int_dtype_with_size_in_bytes(size):
+    try:
+        return UNSIGNED_INT_DTYPES_BY_SIZE_BYTES[size]
+    except KeyError:
+        raise ValueError(
+            "No unsigned integral dtype whose size is %d bytes." % size
+        )
+
+
 class NoDefaultMissingValue(Exception):
     pass
 