mirror of
https://github.com/wassname/catalyst.git
synced 2026-06-30 19:06:50 +08:00
5f190395ad
- Adds a new class, ``LabelArray``, which is a subclass of np.ndarray. LabelArray is conceptually similar to pandas.Categorical, in that it stores data with many duplicate values as indices into an array of unique values. For string data with many duplicates (e.g. time-series of tickers or industry classifications), this provides multiple orders of magnitude of improvement when doing string operations, especially string comparison/matching operations. - Adds a new generic object "specialization" for `AdjustedArrayWindow`, and a corresponding ObjectOverwrite adjustment. - Adds a new ``postprocess`` method to ``zipline.pipeline.term.Term``. This method is called on the final result of any pipeline expression after screen filtering has occurred. The default implementation of ``postprocess`` is identity, but Classifier overrides it to coerce string columns into pandas.Categoricals before presenting them to the user.
109 lines
3.5 KiB
Cython
109 lines
3.5 KiB
Cython
"""
|
|
Factorization algorithms.
|
|
"""
|
|
from numpy cimport ndarray, int64_t, PyArray_Check, import_array
|
|
from numpy import arange, asarray, empty, int64, isnan, ndarray
|
|
|
|
import_array()
|
|
|
|
|
|
cpdef factorize_strings_known_categories(ndarray[object] values,
                                         list categories,
                                         object missing_value,
                                         int sort):
    """
    Factorize an array whose categories are already known.

    Any entries not in the specified categories will be given the code for
    `missing_value`.

    Parameters
    ----------
    values : ndarray[object]
        Labels to encode.
    categories : list
        The known labels.  The caller's list is never modified.
    missing_value : object
        Label whose code is assigned to every entry of ``values`` that is not
        in ``categories``.  It is added to the categories if not already
        present.
    sort : int
        If nonzero, the categories are sorted before codes are assigned.

    Returns
    -------
    codes : ndarray[int64]
        For each entry of ``values``, its index into the returned categories.
    categories : ndarray[object]
        The (possibly sorted) categories, including ``missing_value``.
    reverse_categories : dict
        Mapping from each category to its code.
    """
    # Work on a private copy so that appending ``missing_value`` never leaks
    # back into the list owned by the caller.
    categories = list(categories)
    if missing_value not in categories:
        categories.append(missing_value)

    if sort:
        # None cannot be ordered against str/bytes on Python 3, so a plain
        # ``sorted(categories)`` raises TypeError when None is a category.
        # Python 2 orders None before every other object; reproduce that
        # ordering explicitly so both interpreters give the same result.
        categories = (
            [label for label in categories if label is None] +
            sorted([label for label in categories if label is not None])
        )

    cdef:
        Py_ssize_t nvalues = len(values)
        dict reverse_categories = dict(
            zip(categories, range(len(categories)))
        )

    # Short circuit on empty array.
    if not nvalues:
        return (
            asarray([], dtype=int64),
            asarray(categories, dtype=object),
            reverse_categories,
        )

    cdef:
        Py_ssize_t i
        Py_ssize_t missing_code = reverse_categories[missing_value]
        ndarray[int64_t] codes = empty(nvalues, dtype=int64)

    for i in range(nvalues):
        # Unknown labels fall back to the code of ``missing_value``.
        codes[i] = reverse_categories.get(values[i], missing_code)

    return codes, asarray(categories, dtype=object), reverse_categories
|
|
|
|
cpdef factorize_strings(ndarray[object] values,
                        object missing_value,
                        int sort):
    """
    Factorize an array of (possibly duplicated) labels into an array of indices
    into a unique array of labels.

    This is ~30% faster than pandas.factorize, at the cost of not having
    special treatment for NaN, which we don't care about because we only
    support arrays of strings.

    (Though it's faster even if you throw in the nan checks that pandas does,
    because we're using dict and list instead of PyObjectHashTable and
    ObjectVector.  Python's builtin data structures are **really**
    well-optimized.)

    Parameters
    ----------
    values : ndarray[object]
        Labels to encode.
    missing_value : object
        Label that is always present in the result categories.  Before any
        sorting it holds code 0.
    sort : int
        If nonzero, the categories are sorted and the codes remapped to match.

    Returns
    -------
    codes : ndarray[int64]
        For each entry of ``values``, its index into the returned categories.
    categories : ndarray[object]
        The unique labels, including ``missing_value``.
    reverse_categories : dict
        Mapping from each category to its code.
    """
    cdef:
        Py_ssize_t nvalues = len(values)
        list categories = [missing_value]
        dict reverse_categories = {missing_value: 0}

    # Short circuit on empty array.
    if not nvalues:
        return (
            asarray([], dtype=int64),
            asarray(categories, dtype=object),
            reverse_categories,
        )

    cdef:
        Py_ssize_t i, code
        object key = None
        ndarray[int64_t] codes = empty(nvalues, dtype=int64)

    for i in range(nvalues):
        key = values[i]
        code = reverse_categories.get(key, -1)
        if code == -1:
            # First time we see this label: assign the next free code.
            code = len(reverse_categories)
            reverse_categories[key] = code
            categories.append(key)
        codes[i] = code

    cdef ndarray[int64_t, ndim=1] sorter
    cdef ndarray[int64_t, ndim=1] reverse_indexer
    cdef int ncategories
    cdef ndarray[object] categories_array = asarray(categories, dtype=object)

    if sort:
        # This is all taken from pandas.core.algorithms.factorize.
        ncategories = len(categories_array)
        if missing_value is None:
            # None is unorderable against str/bytes on Python 3, so it must
            # not take part in the argsort.  It always sits at index 0 of
            # ``categories``; keep it first (which is also where Python 2
            # ordering puts it) and only sort the remaining labels.
            sorter = empty(ncategories, dtype=int64)
            sorter[0] = 0
            sorter[1:] = categories_array[1:].argsort() + 1
        else:
            # argsort yields the platform index type; coerce it to int64 so
            # that it matches the declared buffer type on every platform.
            sorter = asarray(categories_array.argsort(), dtype=int64)

        # reverse_indexer[old_code] == new_code after sorting.
        reverse_indexer = empty(ncategories, dtype=int64)
        reverse_indexer.put(sorter, arange(ncategories))

        codes = reverse_indexer.take(codes)
        categories_array = categories_array.take(sorter)
        reverse_categories = dict(zip(categories_array, range(ncategories)))

    return codes, categories_array, reverse_categories
|