mirror of
https://github.com/wassname/catalyst.git
synced 2026-06-30 21:27:10 +08:00
Merge pull request #1672 from quantopian/narrow-labelarray
narrow labelarray
This commit is contained in:
@@ -1,10 +1,13 @@
|
||||
from itertools import product
|
||||
from operator import eq, ne
|
||||
import numpy as np
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
from toolz import take
|
||||
|
||||
from zipline.lib.labelarray import LabelArray
|
||||
from zipline.testing import check_arrays, parameter_space, ZiplineTestCase
|
||||
from zipline.testing.predicates import assert_equal
|
||||
from zipline.utils.compat import unicode
|
||||
|
||||
|
||||
@@ -337,3 +340,85 @@ class LabelArrayTestCase(ZiplineTestCase):
|
||||
# Write the whole array.
|
||||
arr[:] = orig_arr
|
||||
check_arrays(arr, orig_arr)
|
||||
|
||||
def test_narrow_code_storage(self):
    """
    Check that LabelArray stores its codes in the narrowest unsigned
    integer width that fits the number of categories, both when the
    categories are passed explicitly and when they are inferred from the
    values, and that the array survives a string round trip.
    """
    alphabet = [chr(code_point) for code_point in range(256)]

    def create_categories(width, plus_one):
        # Build ``2 ** width`` distinct strings (one extra when
        # ``plus_one`` is set) out of fixed-length character tuples.
        length = int(width / 8) + plus_one
        count = 2 ** width + plus_one
        combos = take(count, product(alphabet, repeat=length))
        return [''.join(chars) for chars in combos]

    def check_roundtrip(arr):
        # Rebuilding a LabelArray from the string form must give back the
        # same strings.
        rebuilt = LabelArray(arr.as_string_array(), arr.missing_value)
        assert_equal(arr.as_string_array(), rebuilt.as_string_array())

    # (category bit width, one past the boundary?, expected code itemsize)
    cases = (
        (8, False, 1),   # fits in uint8
        (8, True, 2),    # just over uint8 -> uint16
        (16, False, 2),  # fits in uint16
        (16, True, 4),   # just over uint16 -> uint32
    )

    for width, plus_one, expected_itemsize in cases:
        categories = create_categories(width, plus_one=plus_one)
        missing = categories[0]

        # Explicit categories with no values.
        explicit = LabelArray(
            [],
            missing_value=missing,
            categories=categories,
        )
        self.assertEqual(explicit.itemsize, expected_itemsize)
        check_roundtrip(explicit)

        # Categories inferred from the values themselves.
        inferred = LabelArray(categories, missing_value=missing)
        self.assertEqual(inferred.itemsize, expected_itemsize)
        check_roundtrip(inferred)

    # NOTE: we could do this for 32 and 64; however, no one has enough RAM
    # or time for that.
|
||||
|
||||
+180
-75
@@ -1,53 +1,150 @@
|
||||
"""
|
||||
Factorization algorithms.
|
||||
"""
|
||||
from numpy cimport ndarray, int64_t, PyArray_Check, import_array
|
||||
from numpy import arange, asarray, empty, int64, isnan, ndarray, zeros
|
||||
from libc.math cimport floor, log
|
||||
cimport numpy as np
|
||||
import numpy as np
|
||||
|
||||
import_array()
|
||||
from zipline.utils.numpy_utils import unsigned_int_dtype_with_size_in_bytes
|
||||
|
||||
np.import_array()
|
||||
|
||||
|
||||
cpdef factorize_strings_known_categories(ndarray[object] values,
|
||||
list categories,
|
||||
object missing_value,
|
||||
int sort):
|
||||
"""
|
||||
Factorize an array whose categories are already known.
|
||||
cdef inline double log2(double d):
    # Base-2 logarithm computed with the change-of-base identity on top of
    # the natural ``log`` cimported from libc.math.
    cdef double natural_log_of_two = log(2)
    return log(d) / natural_log_of_two
|
||||
|
||||
Any entries not in the specified categories will be given the code for
|
||||
`missing_value`.
|
||||
"""
|
||||
|
||||
# Fused type over the unsigned integer widths; the factorize
# implementations in this module take their ``codes`` buffer as
# ``np.ndarray[unsigned_integral]`` and are specialized once per width.
ctypedef fused unsigned_integral:
|
||||
np.uint8_t
|
||||
np.uint16_t
|
||||
np.uint32_t
|
||||
np.uint64_t
|
||||
|
||||
|
||||
cdef factorize_strings_known_impl(np.ndarray[object] values,
|
||||
Py_ssize_t nvalues,
|
||||
list categories,
|
||||
object missing_value,
|
||||
bint sort,
|
||||
np.ndarray[unsigned_integral] codes):
|
||||
if missing_value not in categories:
|
||||
categories.insert(0, missing_value)
|
||||
|
||||
if sort:
|
||||
categories = sorted(categories)
|
||||
|
||||
cdef:
|
||||
Py_ssize_t nvalues = len(values)
|
||||
dict reverse_categories = dict(
|
||||
zip(categories, range(len(categories)))
|
||||
)
|
||||
|
||||
if not nvalues:
|
||||
return (
|
||||
asarray([], dtype=int64),
|
||||
asarray(categories, dtype=object),
|
||||
reverse_categories,
|
||||
)
|
||||
|
||||
cdef:
|
||||
Py_ssize_t i
|
||||
Py_ssize_t missing_code = reverse_categories[missing_value]
|
||||
ndarray[int64_t] codes = empty(nvalues, dtype=int64)
|
||||
cdef dict reverse_categories = dict(
|
||||
zip(categories, range(len(categories)))
|
||||
)
|
||||
cdef Py_ssize_t i
|
||||
cdef Py_ssize_t missing_code = reverse_categories[missing_value]
|
||||
|
||||
for i in range(nvalues):
|
||||
codes[i] = reverse_categories.get(values[i], missing_code)
|
||||
|
||||
return codes, asarray(categories, dtype=object), reverse_categories
|
||||
return codes, np.asarray(categories, dtype=object), reverse_categories
|
||||
|
||||
|
||||
cpdef factorize_strings(ndarray[object] values,
|
||||
cpdef factorize_strings_known_categories(np.ndarray[object] values,
                                         list categories,
                                         object missing_value,
                                         bint sort):
    """
    Factorize an array whose categories are already known.

    Any entries not in the specified categories will be given the code for
    `missing_value`.

    The codes are written into the narrowest unsigned integer dtype that
    can represent one code per category.

    Parameters
    ----------
    values : np.ndarray[object]
        The values to factorize.
    categories : list
        The known categories. ``missing_value`` is inserted into this list
        by the implementation when it is not already present.
    missing_value : object
        The value whose code is used for entries not in ``categories``.
    sort : bool
        Whether the categories should be sorted before codes are assigned.

    Returns
    -------
    The ``(codes, categories, reverse_categories)`` triple produced by
    ``factorize_strings_known_impl``.

    Raises
    ------
    ValueError
        If there are more categories than a uint64 code can address.
    """
    cdef Py_ssize_t nvalues = len(values)
    cdef Py_ssize_t ncategories = len(categories)

    if missing_value not in categories:
        # The implementation inserts ``missing_value`` into ``categories``
        # when it is absent, so the largest code is one higher than
        # ``len(categories)`` suggests. Count it here; otherwise exactly
        # 2 ** 8 (or 2 ** 16, ...) categories plus an absent missing value
        # would pick a dtype that is one code too narrow.
        ncategories += 1

    if ncategories <= 2 ** 8:
        return factorize_strings_known_impl[np.uint8_t](
            values,
            nvalues,
            categories,
            missing_value,
            sort,
            np.empty(nvalues, dtype=np.uint8),
        )
    elif ncategories <= 2 ** 16:
        return factorize_strings_known_impl[np.uint16_t](
            values,
            nvalues,
            categories,
            missing_value,
            sort,
            np.empty(nvalues, dtype=np.uint16),
        )
    elif ncategories <= 2 ** 32:
        return factorize_strings_known_impl[np.uint32_t](
            values,
            nvalues,
            categories,
            missing_value,
            sort,
            np.empty(nvalues, dtype=np.uint32),
        )
    elif ncategories <= 2 ** 64:
        return factorize_strings_known_impl[np.uint64_t](
            values,
            nvalues,
            categories,
            missing_value,
            sort,
            np.empty(nvalues, dtype=np.uint64),
        )
    else:
        raise ValueError('ncategories larger than uint64')
|
||||
|
||||
|
||||
cdef factorize_strings_impl(np.ndarray[object] values,
                            Py_ssize_t nvalues,
                            object missing_value,
                            bint sort,
                            np.ndarray[unsigned_integral] codes):
    """
    Factorize ``values`` into ``codes``, discovering categories on the fly.

    ``missing_value`` always receives code 0. Every other distinct value is
    given the next free code in order of first appearance; when ``sort`` is
    true the categories after the missing value are then sorted and the
    codes remapped to match.

    ``codes`` is filled for indices ``0 .. nvalues - 1``.
    NOTE(review): presumably the caller sizes ``codes`` to ``nvalues`` and
    picks a dtype wide enough for the largest code -- confirm at call sites.

    Returns ``(codes, categories_array, reverse_categories)`` where
    ``categories_array`` is an object array of the categories and
    ``reverse_categories`` maps each category to its code.
    """
    cdef list categories = [missing_value]
    cdef dict reverse_categories = {missing_value: 0}
    # Sizes and indices are all Py_ssize_t so that the category count cannot
    # be truncated by a narrower C ``int``.
    cdef Py_ssize_t i, code, ncategories
    cdef object key = None
    cdef np.ndarray[np.int64_t, ndim=1] sorter
    cdef np.ndarray[unsigned_integral, ndim=1] reverse_indexer
    cdef np.ndarray[object] categories_array

    for i in range(nvalues):
        key = values[i]
        code = reverse_categories.get(key, -1)
        if code == -1:
            # Assign new code.
            code = len(reverse_categories)
            reverse_categories[key] = code
            categories.append(key)
        codes[i] = code

    categories_array = np.asarray(categories, dtype=object)

    if sort:
        # This is all adapted from pandas.core.algorithms.factorize.
        ncategories = len(categories_array)
        sorter = np.zeros(ncategories, dtype=np.int64)

        # Don't include missing_value in the argsort, because None is
        # unorderable with bytes/str in py3. Always just sort it to 0.
        sorter[1:] = categories_array[1:].argsort() + 1
        reverse_indexer = np.empty(ncategories, dtype=codes.dtype)
        reverse_indexer.put(sorter, np.arange(ncategories))

        # Translate first-seen codes into sorted-order codes and rebuild
        # the category -> code mapping for the new order.
        codes = reverse_indexer.take(codes)
        categories_array = categories_array.take(sorter)
        reverse_categories = dict(zip(categories_array, range(ncategories)))

    return codes, categories_array, reverse_categories
|
||||
|
||||
|
||||
cpdef factorize_strings(np.ndarray[object] values,
|
||||
object missing_value,
|
||||
int sort):
|
||||
"""
|
||||
@@ -63,52 +160,60 @@ cpdef factorize_strings(ndarray[object] values,
|
||||
ObjectVector. Python's builtin data structures are **really**
|
||||
well-optimized.)
|
||||
"""
|
||||
cdef:
|
||||
Py_ssize_t nvalues = len(values)
|
||||
list categories = [missing_value]
|
||||
dict reverse_categories = {missing_value: 0}
|
||||
cdef Py_ssize_t nvalues = len(values)
|
||||
cdef np.ndarray codes
|
||||
cdef np.ndarray categories_array
|
||||
cdef dict reverse_categories
|
||||
|
||||
# Short circuit on empty array.
|
||||
if not nvalues:
|
||||
return (
|
||||
asarray([], dtype=int64),
|
||||
asarray(categories, dtype=object),
|
||||
reverse_categories,
|
||||
if nvalues <= 2 ** 8:
|
||||
# we won't try to shrink because the ``codes`` array cannot get any
|
||||
# smaller
|
||||
return factorize_strings_impl[np.uint8_t](
|
||||
values,
|
||||
nvalues,
|
||||
missing_value,
|
||||
sort,
|
||||
np.empty(nvalues, dtype=np.uint8)
|
||||
)
|
||||
elif nvalues <= 2 ** 16:
|
||||
(codes,
|
||||
categories_array,
|
||||
reverse_categories) = factorize_strings_impl[np.uint16_t](
|
||||
values,
|
||||
nvalues,
|
||||
missing_value,
|
||||
sort,
|
||||
np.empty(nvalues, np.uint16),
|
||||
)
|
||||
elif nvalues <= 2 ** 32:
|
||||
(codes,
|
||||
categories_array,
|
||||
reverse_categories) = factorize_strings_impl[np.uint32_t](
|
||||
values,
|
||||
nvalues,
|
||||
missing_value,
|
||||
sort,
|
||||
np.empty(nvalues, np.uint32),
|
||||
)
|
||||
elif nvalues <= 2 ** 64:
|
||||
(codes,
|
||||
categories_array,
|
||||
reverse_categories) = factorize_strings_impl[np.uint64_t](
|
||||
values,
|
||||
nvalues,
|
||||
missing_value,
|
||||
sort,
|
||||
np.empty(nvalues, np.uint64),
|
||||
)
|
||||
else:
|
||||
# unreachable
|
||||
raise ValueError('nvalues larger than uint64')
|
||||
|
||||
cdef:
|
||||
Py_ssize_t i, code
|
||||
object key = None
|
||||
ndarray[int64_t] codes = empty(nvalues, dtype=int64)
|
||||
|
||||
for i in range(nvalues):
|
||||
key = values[i]
|
||||
code = reverse_categories.get(key, -1)
|
||||
if code == -1:
|
||||
# Assign new code.
|
||||
code = len(reverse_categories)
|
||||
reverse_categories[key] = code
|
||||
categories.append(key)
|
||||
codes[i] = code
|
||||
|
||||
cdef ndarray[int64_t, ndim=1] sorter
|
||||
cdef ndarray[int64_t, ndim=1] reverse_indexer
|
||||
cdef int ncategories
|
||||
cdef ndarray[object] categories_array = asarray(categories, dtype=object)
|
||||
|
||||
if sort:
|
||||
# This is all adapted from pandas.core.algorithms.factorize.
|
||||
ncategories = len(categories_array)
|
||||
sorter = zeros(ncategories, dtype=int64)
|
||||
|
||||
# Don't include missing_value in the argsort, because None is
|
||||
# unorderable with bytes/str in py3. Always just sort it to 0.
|
||||
sorter[1:] = categories_array[1:].argsort() + 1
|
||||
reverse_indexer = empty(ncategories, dtype=int64)
|
||||
reverse_indexer.put(sorter, arange(ncategories))
|
||||
|
||||
codes = reverse_indexer.take(codes)
|
||||
categories_array = categories_array.take(sorter)
|
||||
reverse_categories = dict(zip(categories_array, range(ncategories)))
|
||||
if len(categories_array) < 2 ** codes.dtype.itemsize:
|
||||
# if there are a lot of duplicates in the values we may need to shrink
|
||||
# the width of the ``codes`` array
|
||||
codes = codes.astype(unsigned_int_dtype_with_size_in_bytes(
|
||||
floor(log2(len(categories_array))),
|
||||
))
|
||||
|
||||
return codes, categories_array, reverse_categories
|
||||
|
||||
+26
-12
@@ -21,7 +21,7 @@ from zipline.utils.input_validation import (
|
||||
)
|
||||
from zipline.utils.numpy_utils import (
|
||||
bool_dtype,
|
||||
int_dtype_with_size_in_bytes,
|
||||
unsigned_int_dtype_with_size_in_bytes,
|
||||
is_object,
|
||||
)
|
||||
from zipline.utils.pandas_utils import ignore_pandas_nan_categorical_warning
|
||||
@@ -81,6 +81,7 @@ class CategoryMismatch(ValueError):
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
_NotPassed = sentinel('_NotPassed')
|
||||
|
||||
|
||||
@@ -175,7 +176,7 @@ class LabelArray(ndarray):
|
||||
)
|
||||
categories.setflags(write=False)
|
||||
|
||||
return cls._from_codes_and_metadata(
|
||||
return cls.from_codes_and_metadata(
|
||||
codes=codes.reshape(values.shape),
|
||||
categories=categories,
|
||||
reverse_categories=reverse_categories,
|
||||
@@ -183,13 +184,24 @@ class LabelArray(ndarray):
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _from_codes_and_metadata(cls,
|
||||
codes,
|
||||
categories,
|
||||
reverse_categories,
|
||||
missing_value):
|
||||
def from_codes_and_metadata(cls,
|
||||
codes,
|
||||
categories,
|
||||
reverse_categories,
|
||||
missing_value):
|
||||
"""
|
||||
View codes as a LabelArray and set LabelArray metadata on the result.
|
||||
Rehydrate a LabelArray from the codes and metadata.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
codes : np.ndarray[integral]
|
||||
The codes for the label array.
|
||||
categories : np.ndarray[object]
|
||||
The unique string categories.
|
||||
reverse_categories : dict[str, int]
|
||||
The mapping from category to its code-index.
|
||||
missing_value : any
|
||||
The value used to represent missing data.
|
||||
"""
|
||||
ret = codes.view(type=cls, dtype=np.void)
|
||||
ret._categories = categories
|
||||
@@ -289,7 +301,7 @@ class LabelArray(ndarray):
|
||||
"""
|
||||
return self.view(
|
||||
type=ndarray,
|
||||
dtype=int_dtype_with_size_in_bytes(self.itemsize),
|
||||
dtype=unsigned_int_dtype_with_size_in_bytes(self.itemsize),
|
||||
)
|
||||
|
||||
def as_string_array(self):
|
||||
@@ -384,7 +396,9 @@ class LabelArray(ndarray):
|
||||
|
||||
# Result is a scalar value, which will be an instance of np.void.
|
||||
# Map it back to one of our category entries.
|
||||
index = result.view(int_dtype_with_size_in_bytes(self.itemsize))
|
||||
index = result.view(
|
||||
unsigned_int_dtype_with_size_in_bytes(self.itemsize),
|
||||
)
|
||||
return self.categories[index]
|
||||
|
||||
def is_missing(self):
|
||||
@@ -514,11 +528,11 @@ class LabelArray(ndarray):
|
||||
Make an empty LabelArray with the same categories as ``self``, filled
|
||||
with ``self.missing_value``.
|
||||
"""
|
||||
return type(self)._from_codes_and_metadata(
|
||||
return type(self).from_codes_and_metadata(
|
||||
codes=np.full(
|
||||
shape,
|
||||
self.reverse_categories[self.missing_value],
|
||||
dtype=int_dtype_with_size_in_bytes(self.itemsize),
|
||||
dtype=unsigned_int_dtype_with_size_in_bytes(self.itemsize),
|
||||
),
|
||||
categories=self.categories,
|
||||
reverse_categories=self.reverse_categories,
|
||||
|
||||
@@ -87,6 +87,13 @@ INT_DTYPES_BY_SIZE_BYTES = OrderedDict([
|
||||
(8, dtype('int64')),
|
||||
])
|
||||
|
||||
# Unsigned integer dtypes keyed by their size in bytes, narrowest first
# (1 -> uint8, 2 -> uint16, 4 -> uint32, 8 -> uint64).
UNSIGNED_INT_DTYPES_BY_SIZE_BYTES = OrderedDict(
    (nbytes, dtype('uint%d' % (8 * nbytes)))
    for nbytes in (1, 2, 4, 8)
)
|
||||
|
||||
|
||||
def int_dtype_with_size_in_bytes(size):
|
||||
try:
|
||||
@@ -95,6 +102,15 @@ def int_dtype_with_size_in_bytes(size):
|
||||
raise ValueError("No integral dtype whose size is %d bytes." % size)
|
||||
|
||||
|
||||
def unsigned_int_dtype_with_size_in_bytes(size):
    """
    Look up the unsigned integer dtype that is ``size`` bytes wide.

    Raises ValueError when UNSIGNED_INT_DTYPES_BY_SIZE_BYTES has no entry
    for ``size``.
    """
    try:
        found = UNSIGNED_INT_DTYPES_BY_SIZE_BYTES[size]
    except KeyError:
        message = "No unsigned integral dtype whose size is %d bytes." % size
        raise ValueError(message)
    return found
|
||||
|
||||
|
||||
class NoDefaultMissingValue(Exception):
|
||||
pass
|
||||
|
||||
|
||||
Reference in New Issue
Block a user