Package pyarrow along with ray. (#822)

* Rough pass at installing pyarrow along with Ray.

* Remove hardcoded path and try to find correct path automatically.

* Add print.

* Fix linting.

* Copy pyarrow files to a location that we manually add to the Python path in order to avoid interfering with pre-existing pyarrow installations.

* Move call to build.sh back into build_ext in setup.py.

* Ignore some linting errors.

* Fix problem in which pyarrow files to copy were listed before they were built.

* Fix tests by importing ray before pyarrow.
This commit is contained in:
Robert Nishihara
2017-08-07 21:17:28 -07:00
committed by Philipp Moritz
parent 0e6e38115f
commit 03f2325780
6 changed files with 59 additions and 46 deletions
+13 -4
View File
@@ -2,11 +2,20 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
# Add the directory containing pyarrow to the Python path so that we find the
# pyarrow version packaged with ray and not a pre-existing pyarrow.
pyarrow_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
"pyarrow_files")
sys.path.insert(0, pyarrow_path)
from ray.worker import (register_class, error_info, init, connect, disconnect,
get, put, wait, remote, log_event, log_span,
flush_log, get_gpu_ids)
from ray.worker import SCRIPT_MODE, WORKER_MODE, PYTHON_MODE, SILENT_MODE
from ray.worker import global_state
flush_log, get_gpu_ids) # noqa: E402
from ray.worker import (SCRIPT_MODE, WORKER_MODE, PYTHON_MODE,
SILENT_MODE) # noqa: E402
from ray.worker import global_state # noqa: E402
# We import ray.actor because some code is run in actor.py which initializes
# some functions in the worker.
import ray.actor # noqa: F401
@@ -20,7 +29,7 @@ __all__ = ["register_class", "error_info", "init", "connect", "disconnect",
"flush_log", "actor", "get_gpu_ids", "SCRIPT_MODE", "WORKER_MODE",
"PYTHON_MODE", "SILENT_MODE", "global_state", "__version__"]
import ctypes
import ctypes # noqa: E402
# Windows only
if hasattr(ctypes, "windll"):
# Makes sure that all child processes die when we die. Also makes sure that
+3 -2
View File
@@ -10,14 +10,15 @@ import sys
import time
import unittest
import pyarrow as pa
# The ray import must come before the pyarrow import because importing ray
# prepends its bundled pyarrow directory to sys.path, so that the pyarrow
# packaged with ray is found instead of a pre-existing installation.
import ray.global_scheduler as global_scheduler
import ray.local_scheduler as local_scheduler
import ray.plasma as plasma
from ray.plasma.utils import create_object
from ray import services
from ray.experimental import state
import pyarrow as pa
USE_VALGRIND = False
PLASMA_STORE_MEMORY = 1000000000
+4 -2
View File
@@ -13,12 +13,14 @@ import threading
import time
import unittest
import pyarrow as pa
import pyarrow.plasma as plasma
# The ray import must come before the pyarrow import because importing ray
# prepends its bundled pyarrow directory to sys.path, so that the pyarrow
# packaged with ray is found instead of a pre-existing installation.
import ray
from ray.plasma.utils import (random_object_id,
create_object_with_id, create_object)
from ray import services
import pyarrow as pa
import pyarrow.plasma as plasma
USE_VALGRIND = False
PLASMA_STORE_MEMORY = 1000000000
View File
+31 -37
View File
@@ -10,35 +10,42 @@ import sys
from setuptools import setup, find_packages, Distribution
import setuptools.command.build_ext as _build_ext
# This used to be the first line of the run method in the build_ext class.
# However, we moved it here because the previous approach seemed to fail in
# Docker. Inside of the build.sh script, we install the pyarrow Python module.
# Something about calling "python setup.py install" inside of the build_ext
# run method doesn't work (this is easily reproducible in Docker with just a
# couple files to simulate two Python modules). The problem is that the pyarrow
# module doesn't get added to the easy-install.pth file, so it never gets added
# to the Python path even though the package is built and copied to the right
# location. An alternative fix would be to manually modify the easy-install.pth
# file. TODO(rkn): Fix all of this.
#
# Note: We are passing in sys.executable so that we use the same version of
# Python to build pyarrow inside the build.sh script. Note that certain flags
# will not be passed along such as --user or sudo. TODO(rkn): Fix this.
subprocess.check_call(["../build.sh", sys.executable])
# Ideally, we could include these files by putting them in a
# MANIFEST.in or using the package_data argument to setup, but the
# MANIFEST.in gets applied at the very beginning when setup.py runs
# before these files have been created, so we have to move the files
# manually.
ray_files = [
"ray/core/src/common/thirdparty/redis/src/redis-server",
"ray/core/src/common/redis_module/libray_redis_module.so",
"ray/core/src/plasma/plasma_store",
"ray/core/src/plasma/plasma_manager",
"ray/core/src/local_scheduler/local_scheduler",
"ray/core/src/local_scheduler/liblocal_scheduler_library.so",
"ray/core/src/numbuf/libnumbuf.so",
"ray/core/src/global_scheduler/global_scheduler",
"ray/WebUI.ipynb"
]
class build_ext(_build_ext.build_ext):
def run(self):
# The line below has been moved outside of the build_ext class. See the
# explanation there.
# subprocess.check_call(["../build.sh"])
# Note: We pass in sys.executable so that the build.sh script builds
# pyarrow with the same version of Python that is running this setup.py.
# Note that certain flags, such as --user or sudo, will not be passed
# along. TODO(rkn): Fix this.
subprocess.check_call(["../build.sh", sys.executable])
# We also need to install pyarrow along with Ray, so make sure that the
# relevant non-Python pyarrow files get copied.
pyarrow_files = [
os.path.join("ray/pyarrow_files/pyarrow", filename)
for filename in os.listdir("./ray/pyarrow_files/pyarrow")
if not os.path.isdir(os.path.join("ray/pyarrow_files/pyarrow",
filename))]
files_to_include = ray_files + pyarrow_files
# Ideally, we could include these files by putting them in a
# MANIFEST.in or using the package_data argument to setup, but the
# MANIFEST.in gets applied at the very beginning when setup.py runs
# before these files have been created, so we have to move the files
# manually.
for filename in files_to_include:
self.move_file(filename)
# Copy over the autogenerated flatbuffer Python bindings.
@@ -62,19 +69,6 @@ class build_ext(_build_ext.build_ext):
shutil.copy(source, destination)
files_to_include = [
"ray/core/src/common/thirdparty/redis/src/redis-server",
"ray/core/src/common/redis_module/libray_redis_module.so",
"ray/core/src/plasma/plasma_store",
"ray/core/src/plasma/plasma_manager",
"ray/core/src/local_scheduler/local_scheduler",
"ray/core/src/local_scheduler/liblocal_scheduler_library.so",
"ray/core/src/numbuf/libnumbuf.so",
"ray/core/src/global_scheduler/global_scheduler",
"ray/WebUI.ipynb"
]
class BinaryDistribution(Distribution):
def has_ext_modules(self):
return True
+8 -1
View File
@@ -62,4 +62,11 @@ cd $TP_DIR/arrow/python
# We set PKG_CONFIG_PATH, which is important so that in cmake, pkg-config can
# find plasma.
ARROW_HOME=$TP_DIR/arrow/cpp/build/cpp-install
PKG_CONFIG_PATH=$ARROW_HOME/lib/pkgconfig PYARROW_WITH_PLASMA=1 PYARROW_BUNDLE_ARROW_CPP=1 $PYTHON_EXECUTABLE setup.py install
PKG_CONFIG_PATH=$ARROW_HOME/lib/pkgconfig PYARROW_WITH_PLASMA=1 PYARROW_BUNDLE_ARROW_CPP=1 $PYTHON_EXECUTABLE setup.py build
PKG_CONFIG_PATH=$ARROW_HOME/lib/pkgconfig PYARROW_WITH_PLASMA=1 PYARROW_BUNDLE_ARROW_CPP=1 $PYTHON_EXECUTABLE setup.py build_ext
# Find the pyarrow directory that was just built and copy it to
# python/ray/pyarrow_files/ so that pyarrow can be packaged along with ray.
# TODO(rkn): This doesn't seem very robust (it takes the first directory at or
# directly under build/ whose path contains 'lib'). Fix this.
PYARROW_BUILD_LIB_DIR=$(find $TP_DIR/arrow/python/build -type d -maxdepth 1 -print | grep -m1 'lib')
echo "copying pyarrow files from $PYARROW_BUILD_LIB_DIR/pyarrow"
cp -r $PYARROW_BUILD_LIB_DIR/pyarrow $TP_DIR/../../python/ray/pyarrow_files/