mirror of
https://github.com/wassname/catalyst.git
synced 2026-07-02 07:19:44 +08:00
66d05aa563
The BcolzDailyBarReader was optimized for the pipeline case of reading all assets at once. Now that the reader is also used to support daily history, the case of reading data for a small number of assets is more common, particularly in algorithms that use the history API and have a high rotation of assets (e.g. an algorithm which uses pipeline to set the active universe). Remove the bottleneck in reading a small number of assets by conditionally reading the slice for each asset from the carray, instead of reading the data for all equities and then indexing into that full array. Above a certain number of assets, it is still better to read all the data at once. On the Quantopian dataset, which holds data for about 20000 equities over the last 10 years (where not all equities trade over the full range), stored in 118 blosc blp files per column, the tipping point where the 'read all' mode wins out is between 3000 and 4000 assets. That number was tested by trying to exercise a worst case scenario where the equities were spread out evenly across the blp files, by stepping along a sorted list of assets that were alive over a query range which spanned 70 trading days. ``` size = 3000 sids = [assets[i] for i in range(0, len(assets), len(assets) / size)][:size] ``` Also, add a parameter to the WithBcolzDailyBarReader fixture which allows the test to specify what the threshold count for reading all data should be, so that test_us_equity_pricing can be forced into either mode to make sure that both branches in the logic are covered by all test cases. On a local dev machine this patch improves the read time of `load_raw_array` for one asset from 100 ms to 96.5 µs. (10^5 improvement.) Reading only one asset per call is an observed common case when populating the non-cached values in USEquityHistoryLoader.
224 lines
7.4 KiB
Cython
224 lines
7.4 KiB
Cython
#
|
|
# Copyright 2015 Quantopian, Inc.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
import bcolz
|
|
cimport cython
|
|
from cpython cimport bool
|
|
|
|
from numpy import (
|
|
array,
|
|
float64,
|
|
intp,
|
|
uint32,
|
|
zeros,
|
|
)
|
|
from numpy cimport (
|
|
float64_t,
|
|
intp_t,
|
|
ndarray,
|
|
uint32_t,
|
|
uint8_t,
|
|
)
|
|
from numpy.math cimport NAN
|
|
|
|
ctypedef object carray_t
|
|
ctypedef object ctable_t
|
|
ctypedef object Timestamp_t
|
|
ctypedef object DatetimeIndex_t
|
|
ctypedef object Int64Index_t
|
|
|
|
|
|
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef _compute_row_slices(dict asset_starts_absolute,
                          dict asset_ends_absolute,
                          dict asset_starts_calendar,
                          intp_t query_start,
                          intp_t query_end,
                          Int64Index_t requested_assets):
    """
    Translate a calendar query window into per-asset row ranges in the raw
    bcolz data.

    Parameters
    ----------
    asset_starts_absolute : dict
        Maps each asset to the index of its first row in the bcolz data.

    asset_ends_absolute : dict
        Maps each asset to the index of its last row in the bcolz data.

    asset_starts_calendar : dict
        Maps each asset to the calendar index of its first row.

    query_start : intp
    query_end : intp
        Calendar indices of the first and last dates being queried.

    requested_assets : pandas.Int64Index
        The assets to compute row ranges for. Output arrays follow this order.

    Returns
    -------
    first_rows, last_rows, offsets : 3-tuple of ndarrays
        For each requested asset:
        1.) The index in the raw bcolz data of the first row to load.
        2.) The index in the raw bcolz data of the last row to load.
        3.) The position within the queried dates at which the asset's first
            loaded row belongs. Non-zero iff the asset starts after
            `query_start`.

    Notes
    -----
    The calendar index of an asset's last row is derived as its calendar
    start plus its row span (last row - first row).

    An asset whose rows do not overlap the query window comes out with
    first_row > last_row.
    """
    cdef:
        intp_t nsids = len(requested_assets)

        # One output slot per requested asset.
        ndarray[dtype=intp_t, ndim=1] first_rows = zeros(nsids, dtype=intp)
        ndarray[dtype=intp_t, ndim=1] last_rows = zeros(nsids, dtype=intp)
        ndarray[dtype=intp_t, ndim=1] offsets = zeros(nsids, dtype=intp)

        # Loop state.
        intp_t pos = 0
        intp_t sid
        intp_t data_start
        intp_t data_end
        intp_t calendar_start
        intp_t rows_before_query
        intp_t rows_after_query

    for sid in requested_assets:
        data_start = asset_starts_absolute[sid]
        data_end = asset_ends_absolute[sid]
        calendar_start = asset_starts_calendar[sid]

        # Positive: the asset already existed for this many rows when the
        # query begins, so skip them and write at the top of the output.
        # Otherwise: begin at the asset's first row and shift it down by the
        # number of query rows on which the asset did not yet exist.
        rows_before_query = query_start - calendar_start
        if rows_before_query > 0:
            first_rows[pos] = data_start + rows_before_query
            offsets[pos] = 0
        else:
            first_rows[pos] = data_start
            offsets[pos] = -rows_before_query

        # Positive: the asset has this many rows past the end of the query,
        # so trim them. Otherwise the asset's own last row is the end.
        rows_after_query = (
            calendar_start + (data_end - data_start) - query_end
        )
        if rows_after_query > 0:
            last_rows[pos] = data_end - rows_after_query
        else:
            last_rows[pos] = data_end

        pos += 1

    return first_rows, last_rows, offsets
|
|
|
|
|
|
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef _read_bcolz_data(ctable_t table,
                       tuple shape,
                       list columns,
                       intp_t[:] first_rows,
                       intp_t[:] last_rows,
                       intp_t[:] offsets,
                       bool read_all):
    """
    Load raw bcolz data for the given columns and indices.

    Parameters
    ----------
    table : bcolz.ctable
        The table from which to read.
    shape : tuple (length 2)
        The shape of the expected output arrays. ``shape[1]`` must equal the
        length of each of the index arrays.
    columns : list[str]
        List of column names to read.

    first_rows : ndarray[intp]
    last_rows : ndarray[intp]
    offsets : ndarray[intp]
        Arrays in the format returned by _compute_row_slices.
    read_all : bool
        Whether to read all sid data at once, or to read a slice from the
        carray for each sid.

    Returns
    -------
    results : list of ndarray
        A 2D array of shape `shape` for each column in `columns`.
        'open', 'high', 'low' and 'close' columns are returned as float64
        (stored value * .001, with cells equal to 0 replaced by NaN); every
        other column is returned as uint32.

    Raises
    ------
    ValueError
        If the lengths of the index arrays do not all equal ``shape[1]``.
    """
    cdef:
        int nassets
        str column_name
        # Whatever the per-asset slices are taken from: the fully
        # materialized column when `read_all`, else the carray itself.
        object source
        ndarray[dtype=uint32_t, ndim=1] raw_data
        ndarray[dtype=uint32_t, ndim=2] outbuf
        ndarray[dtype=uint8_t, ndim=2, cast=True] where_nan
        ndarray[dtype=float64_t, ndim=2] outbuf_as_float
        intp_t asset
        intp_t first_row
        intp_t last_row
        intp_t offset
        list results = []

    nassets = shape[1]
    if not nassets == len(first_rows) == len(last_rows) == len(offsets):
        raise ValueError("Incompatible index arrays.")

    for column_name in columns:
        outbuf = zeros(shape=shape, dtype=uint32)

        if read_all:
            # Decompress the whole column once. Going through the typed
            # `raw_data` keeps the uint32/1-D validation on assignment.
            raw_data = table[column_name][:]
            source = raw_data
        else:
            # Leave the column compressed; each slice below reads only the
            # rows needed for one asset.
            source = table[column_name]

        for asset in range(nassets):
            first_row = first_rows[asset]
            last_row = last_rows[asset]
            if first_row > last_row:
                # Empty range: leave the column zero-filled.
                continue
            offset = offsets[asset]
            outbuf[offset:offset + (last_row + 1 - first_row), asset] = \
                source[first_row:last_row + 1]

        if column_name in {'open', 'high', 'low', 'close'}:
            # Cells that are 0 in the uint32 buffer (never written, or
            # stored as 0) become NaN after scaling.
            where_nan = (outbuf == 0)
            outbuf_as_float = outbuf.astype(float64) * .001
            outbuf_as_float[where_nan] = NAN
            results.append(outbuf_as_float)
        else:
            results.append(outbuf)
    return results
|