BLD: improvements following unit tests

2026-06-27 21:06:32 +08:00 · 2017-12-22 15:45:23 -05:00
parent 662af595cb
commit 67bd5c8f6a
8 changed files with 279 additions and 12 deletions
@@ -1,6 +1,9 @@
 from logbook import Logger

 from catalyst.constants import LOG_LEVEL
+from catalyst.exchange.factory import find_exchanges
+
+import pandas as pd

 log = Logger('AssetFinderExchange', level=LOG_LEVEL)

@@ -97,3 +100,66 @@ class AssetFinderExchange(object):
            asset = exchange.get_asset(symbol, data_frequency)
            self._asset_cache[key] = asset
            return asset
+
+    def lifetimes(self, dates, include_start_date):
+        """
+        Compute a DataFrame representing asset lifetimes for the specified date
+        range.
+
+        Parameters
+        ----------
+        dates : pd.DatetimeIndex
+            The dates for which to compute lifetimes.
+        include_start_date : bool
+            Whether or not to count the asset as alive on its start_date.
+
+            This is useful in a backtesting context where `lifetimes` is being
+            used to signify "do I have data for this asset as of the morning of
+            this date?"  For many financial metrics, (e.g. daily close), data
+            isn't available for an asset until the end of the asset's first
+            day.
+
+        Returns
+        -------
+        lifetimes : pd.DataFrame
+            A frame of dtype bool with `dates` as index and an Int64Index of
+            assets as columns.  The value at `lifetimes.loc[date, asset]` will
+            be True iff `asset` existed on `date`.  If `include_start_date` is
+            False, then lifetimes.loc[date, asset] will be false when date ==
+            asset.start_date.
+
+        See Also
+        --------
+        numpy.putmask
+        catalyst.pipeline.engine.SimplePipelineEngine._compute_root_mask
+        """
+        exchanges = find_exchanges(features=['minuteBundle'])
+        if not exchanges:
+            raise ValueError('exchange with minute bundles not found')
+
+        # TODO: find a way to support multiple exchanges
+        exchange = exchanges[0]
+        # Using a single exchange for now because are not unique for the
+        # same asset in different exchanges. I'd like to avoid binding
+        # pipeline to a single exchange.
+        exchange.init()
+
+        data = []
+        for dt in dates:
+            exists = []
+
+            for asset in exchange.assets:
+                if include_start_date:
+                    condition = (asset.start_date <= dt < asset.end_minute)
+
+                else:
+                    condition = (asset.start_date < dt < asset.end_minute)
+
+                exists.append(condition)
+
+            data.append(exists)
+
+        sids = [asset.sid for asset in exchange.assets]
+        df = pd.DataFrame(data, index=dates, columns=sids)
+
+        return df
@@ -677,7 +677,8 @@ class ExchangeTradingAlgorithmLive(ExchangeTradingAlgorithmBase):
        self.perf_tracker.update_performance()

        frame_stats = self.prepare_period_stats(
-            data.current_dt, data.current_dt + timedelta(minutes=1))
+            data.current_dt, data.current_dt + timedelta(minutes=1)
+        )

        # Saving the last hour in memory
        self.frame_stats.append(frame_stats)
@@ -339,7 +339,7 @@ class DataPortalExchangeBacktest(DataPortalExchangeBase):
            field=field,
            data_frequency=adj_data_frequency,
            algo_end_dt=self._last_available_session,
-            trailing_bar_count=trailing_bar_count
+            trailing_bar_count=trailing_bar_count,
        )

        df = resample_history_df(pd.DataFrame(series), freq, field)
@@ -0,0 +1,158 @@
+# Copyright 2015 Quantopian, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from numpy import (
+    iinfo,
+    uint32,
+)
+
+from catalyst.data.us_equity_pricing import BcolzDailyBarReader
+from catalyst.lib.adjusted_array import AdjustedArray
+from catalyst.errors import NoFurtherDataError
+from catalyst.pipeline.data import DataSet, Column
+from catalyst.pipeline.loaders.base import PipelineLoader
+from catalyst.utils.calendars import get_calendar
+from catalyst.utils.numpy_utils import float64_dtype
+
+UINT32_MAX = iinfo(uint32).max
+
+
+class TradingPairPricing(DataSet):
+    """
+    Dataset representing daily trading prices and volumes.
+    """
+    open = Column(float64_dtype)
+    high = Column(float64_dtype)
+    low = Column(float64_dtype)
+    close = Column(float64_dtype)
+    volume = Column(float64_dtype)
+
+
+class ExchangePricingLoader(PipelineLoader):
+    """
+    PipelineLoader for Crypto Pricing data
+
+    Delegates loading of baselines and adjustments.
+    """
+
+    def __init__(self, data_frequency):
+
+        cal = get_calendar('OPEN')
+
+        if data_frequency == 'daily':
+            reader = None
+            all_sessions = cal.all_sessions
+
+        elif data_frequency == 'minute':
+            reader = None
+            all_sessions = cal.all_minutes
+
+        else:
+            raise ValueError(
+                'Invalid data frequency: {}'.format(data_frequency)
+            )
+
+        self.raw_price_loader = reader
+        self._columns = TradingPairPricing.columns
+        self._all_sessions = all_sessions
+
+    @classmethod
+    def from_files(cls, pricing_path):
+        """
+        Create a loader from a bcolz equity pricing dir and a SQLite
+        adjustments path.
+
+        Parameters
+        ----------
+        pricing_path : str
+            Path to a bcolz directory written by a BcolzDailyBarWriter.
+        """
+        return cls(
+            BcolzDailyBarReader(pricing_path),
+        )
+
+    def load_adjusted_array(self, columns, dates, assets, mask):
+        # load_adjusted_array is called with dates on which the user's algo
+        # will be shown data, which means we need to return the data that would
+        # be known at the start of each date.  We assume that the latest data
+        # known on day N is the data from day (N - 1), so we shift all query
+        # dates back by a day.
+        start_date, end_date = _shift_dates(
+            self._all_sessions, dates[0], dates[-1], shift=1,
+        )
+        colnames = [c.name for c in columns]
+        raw_arrays = self.raw_price_loader.load_raw_arrays(
+            colnames,
+            start_date,
+            end_date,
+            assets,
+        )
+
+        out = {}
+        for c, c_raw in zip(columns, raw_arrays):
+            out[c] = AdjustedArray(
+                c_raw.astype(c.dtype),
+                mask,
+                {},
+                c.missing_value,
+            )
+        return out
+
+    @property
+    def columns(self):
+        return self._columns
+
+
+def _shift_dates(dates, start_date, end_date, shift):
+    try:
+        start = dates.get_loc(start_date)
+    except KeyError:
+        if start_date < dates[0]:
+            raise NoFurtherDataError(
+                msg=(
+                    "Pipeline Query requested data starting on {query_start}, "
+                    "but first known date is {calendar_start}"
+                ).format(
+                    query_start=str(start_date),
+                    calendar_start=str(dates[0]),
+                )
+            )
+        else:
+            raise ValueError("Query start %s not in calendar" % start_date)
+
+    # Make sure that shifting doesn't push us out of the calendar.
+    if start < shift:
+        raise NoFurtherDataError(
+            msg=(
+                "Pipeline Query requested data from {shift}"
+                " days before {query_start}, but first known date is only "
+                "{start} days earlier."
+            ).format(shift=shift, query_start=start_date, start=start),
+        )
+
+    try:
+        end = dates.get_loc(end_date)
+    except KeyError:
+        if end_date > dates[-1]:
+            raise NoFurtherDataError(
+                msg=(
+                    "Pipeline Query requesting data up to {query_end}, "
+                    "but last known date is {calendar_end}"
+                ).format(
+                    query_end=end_date,
+                    calendar_end=dates[-1],
+                )
+            )
+        else:
+            raise ValueError("Query end %s not in calendar" % end_date)
+    return dates[start - shift], dates[end - shift]
@@ -1,5 +1,6 @@
 import copy
 import csv
+import json
 import numbers
 import os
 import time
@@ -9,8 +10,10 @@ import pandas as pd
 from catalyst.assets._assets import TradingPair

 from catalyst.exchange.exchange_utils import get_algo_folder
+from catalyst.utils.paths import data_root

 s3_conn = []
+mailgun = []


 def trend_direction(series):
@@ -351,6 +354,35 @@ def stats_to_s3(uri, stats, algo_namespace, recorded_cols=None,
    obj.put(Body=bytes_to_write)


+def email_error(algo_name, dt, e, environ=None):
+    import requests
+    import traceback
+
+    if not mailgun:
+        root = data_root(environ)
+        filename = os.path.join(root, 'mailgun.json')
+        if not os.path.exists(filename):
+            raise ValueError(
+                'mailgun.json not found in the catalyst data folder'
+            )
+
+        with open(filename) as data_file:
+            mailgun.append(json.load(data_file))
+
+    mg = mailgun[0]
+
+    return requests.post(
+        mg['url'],
+        auth=("api", mg['api']),
+        data={
+            "from": mg['from'],
+            "to": mg['to'],
+            "subject": 'Error: {}'.format(algo_name),
+            "text": '{}\n\n{}\n{}'.format(
+                dt, e, traceback.format_exc()
+            )})
+
+
 def stats_to_algo_folder(stats, algo_namespace, recorded_cols=None):
    """
    Saves the performance stats to the algo local folder.
@@ -33,7 +33,6 @@ from catalyst.utils.sharedoc import copydoc


 class PipelineEngine(with_metaclass(ABCMeta)):
-
    @abstractmethod
    def run_pipeline(self, pipeline, start_date, end_date):
        """
@@ -118,6 +117,7 @@ class ExplodingPipelineEngine(PipelineEngine):
    """
    A PipelineEngine that doesn't do anything.
    """
+
    def run_pipeline(self, pipeline, start_date, end_date):
        raise NoEngineRegistered(
            "Attempted to run a pipeline but no pipeline "
@@ -484,8 +484,9 @@ class SimplePipelineEngine(PipelineEngine):
            )

            if isinstance(term, LoadableTerm):
+                term_key = loader_group_key(term)
                to_load = sorted(
-                    loader_groups[loader_group_key(term)],
+                    loader_groups[term_key],
                    key=lambda t: t.dataset
                )
                loader = get_loader(term)
@@ -12,7 +12,11 @@ from logbook import Logger

 from catalyst.data.bundles import load
 from catalyst.data.data_portal import DataPortal
+from catalyst.exchange.exchange_pricing_loader import ExchangePricingLoader, \
+    TradingPairPricing
 from catalyst.exchange.factory import get_exchange
+from catalyst.pipeline import USEquityPricingLoader
+from catalyst.pipeline.data import USEquityPricing

 try:
    from pygments import highlight
@@ -173,7 +177,14 @@ def _run(handle_data,
        asset_db_path=None  # We don't need an asset db, we have exchanges
    )
    env.asset_finder = AssetFinderExchange()
-    choose_loader = None  # TODO: use the DataPortal in the algo class for this
+
+    def choose_loader(column):
+        bound_cols = TradingPairPricing.columns
+        if column in bound_cols:
+            return ExchangePricingLoader(data_frequency)
+        raise ValueError(
+            "No PipelineLoader registered for column %s." % column
+        )

    if live:
        start = pd.Timestamp.utcnow()
@@ -1,17 +1,15 @@
 import random
-from datetime import timedelta
-
-from logbook import Logger
-from pandas.util.testing import assert_frame_equal

 import pandas as pd
+from logbook import Logger
+from pandas.util.testing import assert_frame_equal

 from catalyst import get_calendar
 from catalyst.exchange.asset_finder_exchange import AssetFinderExchange
 from catalyst.exchange.exchange_data_portal import DataPortalExchangeBacktest
 from catalyst.exchange.exchange_utils import get_candles_df
 from catalyst.exchange.factory import get_exchange
-from catalyst.exchange.test_utils import select_random_exchanges, output_df, \
+from catalyst.exchange.test_utils import output_df, \
    select_random_assets

 log = Logger('TestSuiteExchange')
@@ -94,9 +92,8 @@ class TestSuiteBundle:
        assert_frame_equal(
            right=data['bundle'],
            left=data['exchange'],
-            check_less_precise=True
+            check_less_precise=True,
        )
-        pass

    def test_validate_bundles(self):
        exchange_population = 3
@@ -144,3 +141,4 @@ class TestSuiteBundle:
                data_frequency=data_frequency,
                data_portal=data_portal,
            )
+        pass