Merge pull request #1672 from quantopian/narrow-labelarray

narrow labelarray
This commit is contained in:
Joe Jevnik
2017-03-02 19:32:56 -05:00
committed by GitHub
4 changed files with 308 additions and 88 deletions
+86 -1
View File
@@ -1,10 +1,13 @@
from itertools import product
from operator import eq, ne
import numpy as np
import warnings
import numpy as np
from toolz import take
from zipline.lib.labelarray import LabelArray
from zipline.testing import check_arrays, parameter_space, ZiplineTestCase
from zipline.testing.predicates import assert_equal
from zipline.utils.compat import unicode
@@ -337,3 +340,85 @@ class LabelArrayTestCase(ZiplineTestCase):
# Write the whole array.
arr[:] = orig_arr
check_arrays(arr, orig_arr)
def test_narrow_code_storage(self):
    """
    Check the byte width LabelArray uses to store its codes.

    For category counts sitting exactly on, and one past, the uint8 and
    uint16 boundaries, build a LabelArray both from explicit categories
    (with no values) and by inference from the values themselves, assert
    the resulting ``itemsize``, and verify the string round trip.
    """
    def create_categories(width, plus_one):
        # Enumerate distinct strings over a 256-character alphabet; each
        # string is ``width / 8`` characters long (one more when
        # ``plus_one`` is set) and ``2 ** width`` (+1) of them are taken.
        length = int(width / 8) + plus_one
        alphabet = [chr(c) for c in range(256)]
        count = 2 ** width + plus_one
        return list(
            map(''.join, take(count, product(alphabet, repeat=length)))
        )

    def check_roundtrip(arr):
        # Rebuilding a LabelArray from the string form must give back the
        # same strings.
        expected = arr.as_string_array()
        rebuilt = LabelArray(arr.as_string_array(), arr.missing_value)
        assert_equal(expected, rebuilt.as_string_array())

    # (width in bits, one category past that width?, expected itemsize)
    cases = [
        (8, False, 1),   # fits in uint8
        (8, True, 2),    # just over uint8
        (16, False, 2),  # fits in uint16
        (16, True, 4),   # just over uint16
    ]
    for width, plus_one, expected_itemsize in cases:
        categories = create_categories(width, plus_one=plus_one)

        # Explicit categories, no values.
        arr = LabelArray(
            [],
            missing_value=categories[0],
            categories=categories,
        )
        self.assertEqual(arr.itemsize, expected_itemsize)
        check_roundtrip(arr)

        # Width inferred from the values themselves.
        arr = LabelArray(categories, missing_value=categories[0])
        self.assertEqual(arr.itemsize, expected_itemsize)
        check_roundtrip(arr)

    # NOTE: we could do this for 32 and 64; however, no one has enough RAM
    # or time for that.
+180 -75
View File
@@ -1,53 +1,150 @@
"""
Factorization algorithms.
"""
from numpy cimport ndarray, int64_t, PyArray_Check, import_array
from numpy import arange, asarray, empty, int64, isnan, ndarray, zeros
from libc.math cimport floor, log
cimport numpy as np
import numpy as np
import_array()
from zipline.utils.numpy_utils import unsigned_int_dtype_with_size_in_bytes
np.import_array()
cpdef factorize_strings_known_categories(ndarray[object] values,
list categories,
object missing_value,
int sort):
"""
Factorize an array whose categories are already known.
cdef inline double log2(double d):
    # Base-2 logarithm via change of base on the natural log from libc.
    return log(d) / log(2.0)
Any entries not in the specified categories will be given the code for
`missing_value`.
"""
# Fused type over the unsigned integer widths. The factorization impl
# functions in this module declare their ``codes`` buffer with it so one
# body can be specialized per code width.
ctypedef fused unsigned_integral:
    np.uint8_t
    np.uint16_t
    np.uint32_t
    np.uint64_t
cdef factorize_strings_known_impl(np.ndarray[object] values,
Py_ssize_t nvalues,
list categories,
object missing_value,
bint sort,
np.ndarray[unsigned_integral] codes):
if missing_value not in categories:
categories.insert(0, missing_value)
if sort:
categories = sorted(categories)
cdef:
Py_ssize_t nvalues = len(values)
dict reverse_categories = dict(
zip(categories, range(len(categories)))
)
if not nvalues:
return (
asarray([], dtype=int64),
asarray(categories, dtype=object),
reverse_categories,
)
cdef:
Py_ssize_t i
Py_ssize_t missing_code = reverse_categories[missing_value]
ndarray[int64_t] codes = empty(nvalues, dtype=int64)
cdef dict reverse_categories = dict(
zip(categories, range(len(categories)))
)
cdef Py_ssize_t i
cdef Py_ssize_t missing_code = reverse_categories[missing_value]
for i in range(nvalues):
codes[i] = reverse_categories.get(values[i], missing_code)
return codes, asarray(categories, dtype=object), reverse_categories
return codes, np.asarray(categories, dtype=object), reverse_categories
cpdef factorize_strings(ndarray[object] values,
cpdef factorize_strings_known_categories(np.ndarray[object] values,
                                         list categories,
                                         object missing_value,
                                         bint sort):
    """
    Factorize an array whose categories are already known.

    Any entries not in the specified categories will be given the code for
    `missing_value`.

    Parameters
    ----------
    values : np.ndarray[object]
        The values to factorize.
    categories : list
        The known categories. If ``missing_value`` is not already present
        it is inserted at index 0, mutating this list in place.
    missing_value : object
        The category whose code is used for entries not in ``categories``.
    sort : bool
        Whether the categories should be sorted before codes are assigned.

    Returns
    -------
    codes : np.ndarray[unsigned integral]
        One code per entry of ``values``, stored in the narrowest unsigned
        dtype able to index every category.
    categories : np.ndarray[object]
        The categories as an object array.
    reverse_categories : dict
        Mapping from category to its code.
    """
    # The impl adds ``missing_value`` to ``categories`` when it is absent.
    # Do it here first so the width selection below counts that extra
    # category; otherwise e.g. 2 ** 8 categories plus a new missing value
    # would be given uint8 codes that cannot hold the largest code.
    if missing_value not in categories:
        categories.insert(0, missing_value)

    cdef Py_ssize_t ncategories = len(categories)
    cdef Py_ssize_t nvalues = len(values)

    if ncategories <= 2 ** 8:
        return factorize_strings_known_impl[np.uint8_t](
            values,
            nvalues,
            categories,
            missing_value,
            sort,
            np.empty(nvalues, dtype=np.uint8),
        )
    elif ncategories <= 2 ** 16:
        return factorize_strings_known_impl[np.uint16_t](
            values,
            nvalues,
            categories,
            missing_value,
            sort,
            np.empty(nvalues, dtype=np.uint16),
        )
    elif ncategories <= 2 ** 32:
        return factorize_strings_known_impl[np.uint32_t](
            values,
            nvalues,
            categories,
            missing_value,
            sort,
            np.empty(nvalues, dtype=np.uint32),
        )
    elif ncategories <= 2 ** 64:
        return factorize_strings_known_impl[np.uint64_t](
            values,
            nvalues,
            categories,
            missing_value,
            sort,
            np.empty(nvalues, dtype=np.uint64),
        )
    else:
        # Kept for completeness; a Py_ssize_t cannot exceed 2 ** 64.
        raise ValueError('ncategories larger than uint64')
cdef factorize_strings_impl(np.ndarray[object] values,
                            Py_ssize_t nvalues,
                            object missing_value,
                            bint sort,
                            np.ndarray[unsigned_integral] codes):
    """
    Factorize ``values`` into ``codes``, discovering categories on the fly.

    ``missing_value`` always receives code 0; every other distinct value is
    given the next unused code in first-seen order. When ``sort`` is set
    the non-missing categories are reordered by ``argsort`` and the codes
    are remapped to match.

    Returns ``(codes, categories_array, reverse_categories)``.
    """
    cdef:
        Py_ssize_t idx, label_code
        object label = None
        list seen = [missing_value]
        dict reverse_categories = {missing_value: 0}
        int ncategories
        np.ndarray[np.int64_t, ndim=1] sorter
        np.ndarray[unsigned_integral, ndim=1] reverse_indexer
        np.ndarray[object] categories_array

    for idx in range(nvalues):
        label = values[idx]
        label_code = reverse_categories.get(label, -1)
        if label_code == -1:
            # First time this label is seen: hand out the next code.
            label_code = len(reverse_categories)
            reverse_categories[label] = label_code
            seen.append(label)
        codes[idx] = label_code

    categories_array = np.asarray(seen, dtype=object)

    if not sort:
        return codes, categories_array, reverse_categories

    # This is all adapted from pandas.core.algorithms.factorize.
    ncategories = len(categories_array)
    sorter = np.zeros(ncategories, dtype=np.int64)

    # Don't include missing_value in the argsort, because None is
    # unorderable with bytes/str in py3. Always just sort it to 0.
    sorter[1:] = categories_array[1:].argsort() + 1

    # Invert the permutation: reverse_indexer[old_code] == new_code.
    reverse_indexer = np.empty(ncategories, dtype=codes.dtype)
    reverse_indexer.put(sorter, np.arange(ncategories))

    codes = reverse_indexer.take(codes)
    categories_array = categories_array.take(sorter)
    reverse_categories = dict(zip(categories_array, range(ncategories)))

    return codes, categories_array, reverse_categories
cpdef factorize_strings(np.ndarray[object] values,
object missing_value,
int sort):
"""
@@ -63,52 +160,60 @@ cpdef factorize_strings(ndarray[object] values,
ObjectVector. Python's builtin data structures are **really**
well-optimized.)
"""
cdef:
Py_ssize_t nvalues = len(values)
list categories = [missing_value]
dict reverse_categories = {missing_value: 0}
cdef Py_ssize_t nvalues = len(values)
cdef np.ndarray codes
cdef np.ndarray categories_array
cdef dict reverse_categories
# Short circuit on empty array.
if not nvalues:
return (
asarray([], dtype=int64),
asarray(categories, dtype=object),
reverse_categories,
if nvalues <= 2 ** 8:
# we won't try to shrink because the ``codes`` array cannot get any
# smaller
return factorize_strings_impl[np.uint8_t](
values,
nvalues,
missing_value,
sort,
np.empty(nvalues, dtype=np.uint8)
)
elif nvalues <= 2 ** 16:
(codes,
categories_array,
reverse_categories) = factorize_strings_impl[np.uint16_t](
values,
nvalues,
missing_value,
sort,
np.empty(nvalues, np.uint16),
)
elif nvalues <= 2 ** 32:
(codes,
categories_array,
reverse_categories) = factorize_strings_impl[np.uint32_t](
values,
nvalues,
missing_value,
sort,
np.empty(nvalues, np.uint32),
)
elif nvalues <= 2 ** 64:
(codes,
categories_array,
reverse_categories) = factorize_strings_impl[np.uint64_t](
values,
nvalues,
missing_value,
sort,
np.empty(nvalues, np.uint64),
)
else:
# unreachable
raise ValueError('nvalues larger than uint64')
cdef:
Py_ssize_t i, code
object key = None
ndarray[int64_t] codes = empty(nvalues, dtype=int64)
for i in range(nvalues):
key = values[i]
code = reverse_categories.get(key, -1)
if code == -1:
# Assign new code.
code = len(reverse_categories)
reverse_categories[key] = code
categories.append(key)
codes[i] = code
cdef ndarray[int64_t, ndim=1] sorter
cdef ndarray[int64_t, ndim=1] reverse_indexer
cdef int ncategories
cdef ndarray[object] categories_array = asarray(categories, dtype=object)
if sort:
# This is all adapted from pandas.core.algorithms.factorize.
ncategories = len(categories_array)
sorter = zeros(ncategories, dtype=int64)
# Don't include missing_value in the argsort, because None is
# unorderable with bytes/str in py3. Always just sort it to 0.
sorter[1:] = categories_array[1:].argsort() + 1
reverse_indexer = empty(ncategories, dtype=int64)
reverse_indexer.put(sorter, arange(ncategories))
codes = reverse_indexer.take(codes)
categories_array = categories_array.take(sorter)
reverse_categories = dict(zip(categories_array, range(ncategories)))
if len(categories_array) < 2 ** codes.dtype.itemsize:
# if there are a lot of duplicates in the values we may need to shrink
# the width of the ``codes`` array
codes = codes.astype(unsigned_int_dtype_with_size_in_bytes(
floor(log2(len(categories_array))),
))
return codes, categories_array, reverse_categories
+26 -12
View File
@@ -21,7 +21,7 @@ from zipline.utils.input_validation import (
)
from zipline.utils.numpy_utils import (
bool_dtype,
int_dtype_with_size_in_bytes,
unsigned_int_dtype_with_size_in_bytes,
is_object,
)
from zipline.utils.pandas_utils import ignore_pandas_nan_categorical_warning
@@ -81,6 +81,7 @@ class CategoryMismatch(ValueError):
)
)
_NotPassed = sentinel('_NotPassed')
@@ -175,7 +176,7 @@ class LabelArray(ndarray):
)
categories.setflags(write=False)
return cls._from_codes_and_metadata(
return cls.from_codes_and_metadata(
codes=codes.reshape(values.shape),
categories=categories,
reverse_categories=reverse_categories,
@@ -183,13 +184,24 @@ class LabelArray(ndarray):
)
@classmethod
def _from_codes_and_metadata(cls,
codes,
categories,
reverse_categories,
missing_value):
def from_codes_and_metadata(cls,
codes,
categories,
reverse_categories,
missing_value):
"""
View codes as a LabelArray and set LabelArray metadata on the result.
Rehydrate a LabelArray from the codes and metadata.
Parameters
----------
codes : np.ndarray[integral]
The codes for the label array.
categories : np.ndarray[object]
The unique string categories.
reverse_categories : dict[str, int]
The mapping from category to its code-index.
missing_value : any
The value used to represent missing data.
"""
ret = codes.view(type=cls, dtype=np.void)
ret._categories = categories
@@ -289,7 +301,7 @@ class LabelArray(ndarray):
"""
return self.view(
type=ndarray,
dtype=int_dtype_with_size_in_bytes(self.itemsize),
dtype=unsigned_int_dtype_with_size_in_bytes(self.itemsize),
)
def as_string_array(self):
@@ -384,7 +396,9 @@ class LabelArray(ndarray):
# Result is a scalar value, which will be an instance of np.void.
# Map it back to one of our category entries.
index = result.view(int_dtype_with_size_in_bytes(self.itemsize))
index = result.view(
unsigned_int_dtype_with_size_in_bytes(self.itemsize),
)
return self.categories[index]
def is_missing(self):
@@ -514,11 +528,11 @@ class LabelArray(ndarray):
Make an empty LabelArray with the same categories as ``self``, filled
with ``self.missing_value``.
"""
return type(self)._from_codes_and_metadata(
return type(self).from_codes_and_metadata(
codes=np.full(
shape,
self.reverse_categories[self.missing_value],
dtype=int_dtype_with_size_in_bytes(self.itemsize),
dtype=unsigned_int_dtype_with_size_in_bytes(self.itemsize),
),
categories=self.categories,
reverse_categories=self.reverse_categories,
+16
View File
@@ -87,6 +87,13 @@ INT_DTYPES_BY_SIZE_BYTES = OrderedDict([
(8, dtype('int64')),
])
# Unsigned integer dtypes keyed by their size in bytes, narrowest first.
UNSIGNED_INT_DTYPES_BY_SIZE_BYTES = OrderedDict(
    (nbytes, dtype('uint%d' % (8 * nbytes)))
    for nbytes in (1, 2, 4, 8)
)
def int_dtype_with_size_in_bytes(size):
try:
@@ -95,6 +102,15 @@ def int_dtype_with_size_in_bytes(size):
raise ValueError("No integral dtype whose size is %d bytes." % size)
def unsigned_int_dtype_with_size_in_bytes(size):
    """
    Look up the unsigned integer dtype that is ``size`` bytes wide.

    Raises
    ------
    ValueError
        If ``size`` is not a key of UNSIGNED_INT_DTYPES_BY_SIZE_BYTES.
    """
    try:
        found = UNSIGNED_INT_DTYPES_BY_SIZE_BYTES[size]
    except KeyError:
        message = "No unsigned integral dtype whose size is %d bytes." % size
        raise ValueError(message)
    return found
class NoDefaultMissingValue(Exception):
    """
    Exception subclass that adds no behavior of its own.

    NOTE(review): presumably raised when no default missing value exists
    for a requested dtype -- confirm against the raise sites.
    """