Files
catalyst/zipline/lib/adjusted_array.pyx
T
Scott Sanderson 41d4133c74 BUG: Use NAN from numpy.
MSVC doesn't define NAN in math.h because they only implement C89.

See http://tdistler.com/2011/03/24/how-to-define-nan-not-a-number-on-windows.
2015-08-21 11:33:20 -04:00

226 lines
6.2 KiB
Cython

"""
Class capable of yielding adjusted chunks of an ndarray.
"""
from cpython cimport (
Py_EQ,
PyObject_RichCompare,
)
from numpy import (
asarray,
bool_,
float64,
full,
uint8,
)
from numpy cimport (
float64_t,
ndarray,
uint8_t,
)
from zipline.errors import (
WindowLengthNotPositive,
WindowLengthTooLong,
)
# NaN is taken from numpy rather than from C's math.h: MSVC only implements
# C89 and does not define NAN there.
cdef double NAN = float64('nan')
# Sentinel meaning "no mask supplied". Mask arguments are compared against it
# by identity (``mask is not NOMASK``).
NOMASK = None
def ensure_ndarray(ndarray_or_adjusted_array):
    """
    Coerce the argument to a numpy ndarray.

    An ndarray is handed back unchanged (same object). An AdjustedArray is
    converted by reading its ``data`` property.

    Parameters
    ----------
    ndarray_or_adjusted_array : numpy.ndarray | AdjustedArray

    Returns
    -------
    out : numpy.ndarray
        The input itself, or the ``data`` of the given AdjustedArray.

    Raises
    ------
    TypeError
        If the input is neither an ndarray nor an AdjustedArray.
    """
    value = ndarray_or_adjusted_array

    # Guard clauses: plain arrays first, then our own wrapper type.
    if isinstance(value, ndarray):
        return value
    if isinstance(value, AdjustedArray):
        return value.data

    type_name = type(value).__name__
    raise TypeError("Can't convert %s to ndarray" % type_name)
cpdef adjusted_array(ndarray data, ndarray mask, dict adjustments):
    """
    Factory that normalises its inputs and builds a Float64AdjustedArray.

    Parameters
    ----------
    data : numpy.ndarray
        Baseline values. Converted with ``astype(float64)`` (which copies)
        only when the dtype is not already float64; otherwise the array is
        passed through as-is.
    mask : numpy.ndarray or None
        ``None`` (``NOMASK``) means no masking is applied. A boolean mask is
        reinterpreted as uint8 via ``view`` before being passed on; any other
        dtype is forwarded unchanged.
    adjustments : dict
        Forwarded untouched to Float64AdjustedArray.

    Returns
    -------
    out : Float64AdjustedArray

    Notes
    -----
    NOTE(review): this docstring used to say the mask contains 0s for valid
    values and 1s for invalid values, but Float64AdjustedArray.__cinit__
    writes NaN into the cells where the mask is 0. Confirm the intended
    convention against callers before relying on either description.
    """
    if data.dtype != float64:
        data = data.astype(float64)

    # Cython will not coerce a bool array to a uint8 buffer on its own, even
    # though the underlying storage is identical, so reinterpret it here.
    if mask is not NOMASK and mask.dtype == bool_:
        mask = mask.view(uint8)

    return Float64AdjustedArray(data, mask, adjustments)
cdef _check_window_length(object data, int window_length):
    """
    Validate ``window_length`` against the number of rows in ``data``.

    Raises
    ------
    WindowLengthNotPositive
        If ``window_length`` is smaller than 1.
    WindowLengthTooLong
        If ``window_length`` exceeds ``data.shape[0]``.
    """
    # NOTE(review): callers in this file pass a Py_ssize_t, which is narrowed
    # to a C int by this signature.
    if window_length < 1:
        raise WindowLengthNotPositive(window_length=window_length)

    nrows = data.shape[0]
    if window_length > nrows:
        raise WindowLengthTooLong(nrows=nrows, window_length=window_length)
cdef class AdjustedArray:
    """
    Common base for adjusted-array types.

    The base class declares no storage of its own: the ``data`` property
    reads ``self._data`` and ``self.dtype``, so subclasses are expected to
    provide both.
    """
    property data:
        def __get__(self):
            # Wrap the internal buffer as an ndarray and hand it out with
            # the write flag cleared on the returned array.
            readonly_view = asarray(self._data, dtype=self.dtype)
            readonly_view.setflags(write=False)
            return readonly_view
cdef class Float64AdjustedArray(AdjustedArray):
    """
    AdjustedArray specialisation backed by a 2-D float64 buffer.

    Parameters
    ----------
    data : float64_t[:, :]
        Baseline values; must not be None. The buffer is stored as given,
        and masked cells are overwritten in it directly.
    mask : uint8_t[:, :] or None
        ``None`` (``NOMASK``) leaves ``data`` untouched. Otherwise the mask
        must have the same shape as ``data``, and every cell whose mask
        entry is 0 is replaced with NaN.
    adjustments : dict
        Stored as-is and passed to the window iterator built by
        ``traverse``.

    Raises
    ------
    ValueError
        If a mask is supplied whose shape differs from that of ``data``.
    """
    cdef:
        readonly float64_t[:, :] _data
        dict adjustments

    def __cinit__(self,
                  float64_t[:, :] data not None,
                  uint8_t[:, :] mask,  # None means "apply no mask".
                  dict adjustments):
        cdef Py_ssize_t i, j, nrows, ncols

        if mask is not NOMASK:
            nrows = mask.shape[0]
            ncols = mask.shape[1]
            if nrows != data.shape[0] or ncols != data.shape[1]:
                raise ValueError(
                    "Mask shape %s != data shape %s" % (
                        (mask.shape[0], mask.shape[1]),
                        (data.shape[0], data.shape[1]),
                    )
                )

            # Overwrite every cell whose mask entry is 0 with NaN.
            # NOTE(review): earlier comments in this file described 0 as the
            # "valid" mask value, which is the opposite of what this loop
            # does -- confirm the intended convention with callers.
            for i in range(nrows):
                for j in range(ncols):
                    if mask[i, j] == 0:
                        data[i, j] = NAN

        self._data = data
        self.adjustments = adjustments

    property dtype:
        def __get__(self):
            return float64

    cpdef traverse(self, Py_ssize_t window_length, Py_ssize_t offset=0):
        """
        Return an iterator of ``window_length``-row windows over this array.

        The iterator receives its own copy of the data, so adjustments it
        applies while iterating do not modify this object's buffer.
        """
        cdef float64_t[:, :] scratch = self._data.copy()
        return _Float64AdjustedArrayWindow(
            scratch,
            self.adjustments,
            window_length,
            offset,
        )
cdef class _Float64AdjustedArrayWindow:
    """
    Iterator yielding a sliding window of rows over an AdjustedArray.

    The window owns the buffer it is given (``traverse`` passes in a copy)
    and mutates that buffer as iteration proceeds, so that each step shows
    the data as adjusted up to that point. Every array yielded is a
    read-only view over that same buffer, not a copy.

    ``adjustments`` maps an integer index to an iterable of objects exposing
    ``mutate(data)``. Indices are consumed in ascending order, and the
    adjustments stored under an index are applied once the anchor has moved
    past it.
    """
    cdef:
        float64_t[:, :] data
        readonly Py_ssize_t window_length
        Py_ssize_t anchor, max_anchor, next_adj
        dict adjustments
        list adjustment_indices

    def __cinit__(self,
                  float64_t[:, :] data,
                  dict adjustments,
                  Py_ssize_t window_length,
                  Py_ssize_t offset):
        _check_window_length(data, window_length)

        self.data = data
        self.adjustments = adjustments
        self.window_length = window_length
        self.max_anchor = data.shape[0]
        # The anchor is the index of the row **after** the last row of the
        # window, i.e. the exclusive end of the slice that gets yielded.
        self.anchor = window_length + offset

        # Sorted in descending order so that pop() hands back the smallest
        # remaining index each time.
        self.adjustment_indices = sorted(adjustments, reverse=True)
        if not self.adjustment_indices:
            # Nothing to apply: park next_adj where it can never be below
            # an anchor that is still allowed to yield.
            self.next_adj = self.max_anchor
        else:
            self.next_adj = self.adjustment_indices.pop()

    def __iter__(self):
        return self

    def __next__(self):
        cdef:
            ndarray[float64_t, ndim=2] window
            object adj
            Py_ssize_t first_row, stop_row

        stop_row = self.anchor
        if stop_row > self.max_anchor:
            raise StopIteration()

        # Apply any adjustments that occurred before the current anchor.
        # Equivalently, apply any adjustments known **on or before** the
        # date for which this window is being calculated.
        while self.next_adj < stop_row:
            for adj in self.adjustments[self.next_adj]:
                adj.mutate(self.data)

            if not self.adjustment_indices:
                self.next_adj = self.max_anchor
            else:
                self.next_adj = self.adjustment_indices.pop()

        first_row = stop_row - self.window_length
        window = asarray(self.data[first_row:stop_row])
        window.setflags(write=False)

        self.anchor += 1
        return window

    def __repr__(self):
        cls_name = type(self).__name__
        return "%s(window_length=%d, anchor=%d, max_anchor=%d)" % (
            cls_name,
            self.window_length,
            self.anchor,
            self.max_anchor,
        )