ENH: remove the ffill lower bound query in blaze loader

This query is often only cutting out a couple of months or a week of
data. The savings from sending back less data do not outweigh the cost of
computing this lower bound.
This commit is contained in:
Joe Jevnik
2016-05-24 21:00:37 -04:00
parent 2d36a58add
commit 163ba8d76d
+1 -33
View File
@@ -904,44 +904,12 @@ class BlazeLoader(dict):
q : Expr
The query to run.
"""
def lower_for_col(column):
    """Build the lower-bound timestamp expression for a single column.

    Parameters
    ----------
    column : object with a ``name`` attribute
        Identifies which field of ``e`` to inspect.

    Returns
    -------
    bound : Expr
        The greatest timestamp at or before ``lower_dt`` among rows where
        the column is non-null (for optional fields) and non-NaN (for
        floating fields). When ``have_sids`` is set, this is instead the
        minimum over sids of that per-sid greatest timestamp.
    """
    field = e[column.name]
    measure = field.schema.measure
    usable = e[TS_FIELD_NAME] <= lower_dt
    if isinstance(measure, Option):
        # Optional fields may hold nulls; unwrap to reach the inner type.
        usable &= field.notnull()
        measure = measure.ty
    if measure in floating:
        usable &= ~field.isnan()
    candidates = e[usable]
    latest = candidates[TS_FIELD_NAME].max()
    if not have_sids:
        return latest
    # With sids present, take the earliest of the greatest dates that
    # have a usable value, computed per sid.
    per_sid = bz.by(
        candidates[SID_FIELD_NAME],
        timestamp=latest,
    )
    return per_sid.timestamp.min()
lower = odo(
reduce(
bz.least,
map(lower_for_col, columns),
),
pd.Timestamp,
**odo_kwargs
)
if lower is pd.NaT:
lower = lower_dt
return e[
(e[TS_FIELD_NAME] >= lower) &
(e[TS_FIELD_NAME] <= upper_dt)
][added_query_fields + list(map(getname, columns))]
def collect_expr(e):
"""Execute and merge all of the per-column subqueries.
"""Materialize the expression as a dataframe.
Parameters
----------