mirror of
https://github.com/wassname/catalyst.git
synced 2026-06-29 00:37:43 +08:00
115 lines
3.7 KiB
Cython
115 lines
3.7 KiB
Cython
"""
|
|
Factorization algorithms.
|
|
"""
|
|
from numpy cimport ndarray, int64_t, PyArray_Check, import_array
|
|
from numpy import arange, asarray, empty, int64, isnan, ndarray, zeros
|
|
|
|
import_array()
|
|
|
|
|
|
cpdef factorize_strings_known_categories(ndarray[object] values,
                                         list categories,
                                         object missing_value,
                                         int sort):
    """
    Factorize an array whose categories are already known.

    Any entries not in the specified categories will be given the code for
    `missing_value`.

    Parameters
    ----------
    values : ndarray[object]
        Labels to encode.
    categories : list
        Known labels. The caller's list is left unmodified.
    missing_value : object
        Label whose code is assigned to entries of ``values`` that are not in
        ``categories``. It is prepended to the categories if not already
        present.
    sort : int
        If nonzero, categories are sorted before codes are assigned. ``None``
        entries are placed first without being compared against the other
        labels.

    Returns
    -------
    codes : ndarray[int64]
        For each entry of ``values``, the index of its label in the returned
        categories array.
    categories : ndarray[object]
        The (possibly extended and sorted) categories.
    reverse_categories : dict
        Mapping from each category to its integer code.
    """
    # Work on a private copy so that inserting the missing value never
    # mutates the list owned by the caller.
    categories = list(categories)

    if missing_value not in categories:
        categories.insert(0, missing_value)

    if sort:
        # None is unorderable with bytes/str in py3, so keep it out of the
        # comparison and always sort it to the front (which is also where
        # py2 ordering would have put it).
        orderable = [label for label in categories if label is not None]
        categories = (
            [None] * (len(categories) - len(orderable)) + sorted(orderable)
        )

    cdef:
        Py_ssize_t nvalues = len(values)
        dict reverse_categories = dict(
            zip(categories, range(len(categories)))
        )

    # Short circuit on empty array.
    if not nvalues:
        return (
            asarray([], dtype=int64),
            asarray(categories, dtype=object),
            reverse_categories,
        )

    cdef:
        Py_ssize_t i
        Py_ssize_t missing_code = reverse_categories[missing_value]
        ndarray[int64_t] codes = empty(nvalues, dtype=int64)

    for i in range(nvalues):
        # Unknown labels fall back to the code of the missing value.
        codes[i] = reverse_categories.get(values[i], missing_code)

    return codes, asarray(categories, dtype=object), reverse_categories
|
|
|
|
|
|
cpdef factorize_strings(ndarray[object] values,
                        object missing_value,
                        int sort):
    """
    Encode an array of (possibly repeated) labels as integer codes that index
    into an array of the distinct labels.

    `missing_value` always receives code 0, whether or not it occurs in
    `values`. Other labels are numbered in order of first appearance; when
    `sort` is nonzero they are renumbered so that the labels after
    `missing_value` are in sorted order.

    Returns a tuple ``(codes, categories, reverse_categories)``: an int64
    array of codes, an object array of distinct labels, and a dict mapping
    each label to its code.

    This is ~30% faster than pandas.factorize. The price is that NaN gets no
    special treatment, which is acceptable because only arrays of strings
    are supported. (It stays faster even with pandas-style nan checks added,
    since dict and list are used in place of PyObjectHashTable and
    ObjectVector.)
    """
    cdef:
        Py_ssize_t n = len(values)
        list uniques = [missing_value]
        dict code_of = {missing_value: 0}

    # Nothing to encode: only the missing value is a category.
    if n == 0:
        return (
            asarray([], dtype=int64),
            asarray(uniques, dtype=object),
            code_of,
        )

    cdef:
        Py_ssize_t idx, assigned
        object label = None
        ndarray[int64_t] codes = empty(n, dtype=int64)

    for idx in range(n):
        label = values[idx]
        assigned = code_of.get(label, -1)
        if assigned < 0:
            # First time this label is seen: give it the next free code.
            assigned = len(uniques)
            uniques.append(label)
            code_of[label] = assigned
        codes[idx] = assigned

    cdef ndarray[int64_t, ndim=1] order
    cdef ndarray[int64_t, ndim=1] inverse
    cdef int n_unique
    cdef ndarray[object] unique_array = asarray(uniques, dtype=object)

    if not sort:
        return codes, unique_array, code_of

    # The remainder is adapted from pandas.core.algorithms.factorize.
    n_unique = len(unique_array)

    # Position 0 is reserved for missing_value: it is left out of the
    # argsort because None cannot be ordered against bytes/str in py3.
    order = zeros(n_unique, dtype=int64)
    order[1:] = unique_array[1:].argsort() + 1

    # inverse[old_code] == new_code
    inverse = empty(n_unique, dtype=int64)
    inverse.put(order, arange(n_unique))

    codes = inverse.take(codes)
    unique_array = unique_array.take(order)
    code_of = dict(zip(unique_array, range(n_unique)))

    return codes, unique_array, code_of
|