diff --git a/.gitignore b/.gitignore index 136e3d3e0..8166a90a4 100644 --- a/.gitignore +++ b/.gitignore @@ -113,3 +113,8 @@ cmake-build-debug/ # Gradle: .idea/**/gradle.xml .idea/**/libraries + +# Website +/site/Gemfile.lock +/site/.sass-cache +/site/_site diff --git a/site/Gemfile b/site/Gemfile new file mode 100644 index 000000000..8af267397 --- /dev/null +++ b/site/Gemfile @@ -0,0 +1,28 @@ +source "https://rubygems.org" +ruby RUBY_VERSION + +# Hello! This is where you manage which Jekyll version is used to run. +# When you want to use a different version, change it below, save the +# file and run `bundle install`. Run Jekyll with `bundle exec`, like so: +# +# bundle exec jekyll serve +# +# This will help ensure the proper Jekyll version is running. +# Happy Jekylling! +gem "jekyll", "3.4.3" + +# This is the default theme for new Jekyll sites. You may change this to anything you like. +gem "minima", "~> 2.0" + +# If you want to use GitHub Pages, remove the "gem "jekyll"" above and +# uncomment the line below. To upgrade, run `bundle update github-pages`. +# gem "github-pages", group: :jekyll_plugins + +# If you have any plugins, put them here! +group :jekyll_plugins do + gem "jekyll-feed", "~> 0.6" +end + +# Windows does not include zoneinfo files, so bundle the tzinfo-data gem +gem 'tzinfo-data', platforms: [:mingw, :mswin, :x64_mingw, :jruby] + diff --git a/site/README.md b/site/README.md new file mode 100644 index 000000000..ff55def0f --- /dev/null +++ b/site/README.md @@ -0,0 +1,24 @@ +# Ray Website + +## Development instructions + +With Ruby >= 2.1 installed, run: + +``` +gem install jekyll bundler +bundle install +``` + +To view the site, run: + +``` +bundle exec jekyll serve +``` + +## Deployment + +To deploy the site, run (inside the main ray directory): + +``` +git subtree push --prefix site origin gh-pages +``` diff --git a/site/_config.yml b/site/_config.yml new file mode 100644 index 000000000..5f7f5d4a0 --- /dev/null +++ b/site/_config.yml @@ -0,0 +1,31 @@ +# Welcome to Jekyll! +# +# This config file is meant for settings that affect your whole blog, values +# which you are expected to set up once and rarely edit after that. If you find +# yourself editing this file very often, consider using Jekyll's data files +# feature for the data you need to update frequently. +# +# For technical reasons, this file is *NOT* reloaded automatically when you use +# 'bundle exec jekyll serve'. If you change this file, please restart the server process. + +# Site settings +# These are used to personalize your new site. If you look in the HTML files, +# you will see them accessed via {{ site.title }}, {{ site.email }}, and so on. +# You can create any custom variable you would like, and they will be accessible +# in the templates via {{ site.myvariable }}. +title: "Ray: A Distributed Execution Framework for AI Applications" +email: "" +description: > # this means to ignore newlines until "baseurl:" + Ray is a flexible, high-performance distributed execution framework for AI applications. +baseurl: "" # the subpath of your site, e.g. /blog +url: "" # the base hostname & protocol for your site, e.g. http://example.com +github_username: ray-project + +# Build settings +markdown: kramdown +theme: minima +gems: + - jekyll-feed +exclude: + - Gemfile + - Gemfile.lock diff --git a/site/_posts/2017-05-17-announcing-ray.markdown b/site/_posts/2017-05-17-announcing-ray.markdown new file mode 100644 index 000000000..5ba97078b --- /dev/null +++ b/site/_posts/2017-05-17-announcing-ray.markdown @@ -0,0 +1,231 @@ +--- +layout: post +title: "Ray: A Distributed Execution Framework for AI Applications" +excerpt: "This post announces Ray, a framework for efficiently running Python code on clusters and large mult-core machines." +date: 2017-05-20 14:00:00 +--- + +This post announces Ray, a framework for efficiently running Python code on +clusters and large multi-core machines. The project is open source. +You can check out [the code](https://github.com/ray-project/ray) and +[the documentation](http://ray.readthedocs.io/en/latest/?badge=latest). + +Many AI algorithms are computationally intensive and exhibit complex +communication patterns. As a result, many researchers spend most of their +time building custom systems to efficiently distribute their code across +clusters of machines. + +However, the resulting systems are often specific to a single algorithm or class +of algorithms. We built Ray to help eliminate a bunch of the redundant +engineering effort that is currently repeated over and over for each new +algorithm. Our hope is that a few basic primitives can be reused to implement +and to efficiently execute a broad range of algorithms and applications. + +
+ +
A simulated robot learning to run using Ray.
+
+
+ +# Simple Parallelization of Existing Code + +Ray enables Python functions to be executed remotely with minimal modifications. + +With **regular Python**, when you call a function, the call blocks until the +function has been executed. This example would take 8 seconds to execute. + +{% highlight python %} +def f(): + time.sleep(1) + +# Calls to f executed serially. +results = [] +for _ in range(8): + result = f() + results.append(result) +{% endhighlight %} + +**With Ray**, when you call a **remote function**, the call immediately returns +an object ID. A task is then created, scheduled, and executed somewhere in the +cluster. This example would take 1 second to execute. + +{% highlight python %} +@ray.remote +def f(): + time.sleep(1) + +# Tasks executed in parallel. +results = [] +for _ in range(8): + result = f.remote() + results.append(result) + +results = ray.get(results) +{% endhighlight %} + +Note that the only changes are that we add the ``@ray.remote`` decorator to the +function definition, we call the function with ``f.remote()``, and we call +``ray.get`` on the list of object IDs in order to block until the corresponding +tasks have finished executing. + +
+ +
+
A graph depicting the tasks and objects in this example. The circles +represent tasks, and the boxes represent objects. There are no arrows between +the 8 separate tasks indicating that all of the tasks can be executed in +parallel.
+
+ +# Flexible Encoding of Task Dependencies + +In contrast with bulk-synchronous parallel frameworks like MapReduce or Apache +Spark, Ray is designed to support AI applications which require fine-grained +task dependencies. In contrast with the computation of aggregate statistics of +an entire dataset, a training procedure may operate on a small subset of data or +on the outputs of a handful of tasks. + +Dependencies can be encoded by passing object IDs (which are the outputs of +tasks) into other tasks. + +{% highlight python %} +import numpy as np + +@ray.remote +def aggregate_data(x, y): + return x + y + +data = [np.random.normal(size=1000) for i in range(4)] + +while len(data) > 1: + intermediate_result = aggregate_data.remote(data[0], data[1]) + data = data[2:] + [intermediate_result] + +result = ray.get(data[0]) +{% endhighlight %} + +By passing the outputs of some calls to `aggregate_data` into subsequent calls +to `aggregate_data`, we encode dependencies between these tasks which can be +used by the system to make scheduling decisions and to coordinate the transfer +of objects. Note that when object IDs are passed into remote function calls, the +actual values will be unpacked before the function is executed, so when the +`aggregate_data` function is executed, `x` and `y` will be numpy arrays. + +
+ +
+
A graph depicting the tasks and objects in this example. The circles +represent tasks, and the boxes represent objects. Arrows point from tasks to the +objects they produce and from objects to the tasks that depend on +them.
+
+ +# Shared Mutable State with Actors + +Ray uses actors to share mutable state between tasks. Here is an example in +which multiple tasks share the state of an Atari simulator. Each task runs the +simulator for several steps picking up where the previous task left off. + +{% highlight python %} +import gym + +@ray.remote +class Simulator(object): + def __init__(self): + self.env = gym.make("Pong-v0") + self.env.reset() + + def step(self, action): + return self.env.step(action) + +# Create a simulator, this will start a new worker that will run all +# methods for this actor. +simulator = Simulator.remote() + +observations = [] +for _ in range(4): + # Take action 0 in the simulator. + observations.append(simulator.step.remote(0)) +{% endhighlight %} + +Each call to `simulator.step.remote` generates a task that is scheduled on the +actor. These tasks mutate the state of the simulator object, and they are +executed one at a time. + +Like remote functions, actor methods return object IDs that can be passed into +other tasks and whose values can be retrieved with `ray.get`. + +
+ +
+
A graph depicting the tasks and objects in this example. The circles +represent tasks, and the boxes represent objects. The first task is the actor's +constructor. The thick arrows are used to show that the methods invoked on this +actor share the underlying state of the actor.
+
+ +# Efficient Shared Memory and Serialization with Apache Arrow + +Serializing and deserializing data is often a bottleneck in distributed +computing. Ray lets worker processes on the same machine access the same objects +through shared memory. To facilitate this, Ray uses an in-memory object store on +each machine to serve objects. + +To illustrate the problem, suppose we create some neural network weights and +wish to ship them from one Python process to another. + +{% highlight python %} +import numpy as np + +weights = {"Variable{}".format(i): np.random.normal(size=5000000) + for i in range(10)} # 2.68s +{% endhighlight %} + +To ship the neural network weights around, we need to first serialize them into +a contiguous blob of bytes. This can be done with standard serialization +libraries like pickle. + +{% highlight python %} +import pickle + +# Serialize the weights with pickle. Then deserialize them. +pickled_weights = pickle.dumps(weights) # 0.986s +new_weights = pickle.loads(pickled_weights) # 0.241s +{% endhighlight %} + +The time required for deserialization is particularly important because one of +the most common patterns in machine learning is to aggregate a large number of +values (for example, neural net weights, rollouts, or other values) in a single +process, so the deserialization step could happen hundreds of times in a row. + +To minimize the time required to deserialize objects in shared memory, we use +the [Apache Arrow](https://arrow.apache.org/) data layout. This allows us to +compute offsets into the serialized blob without scanning through the entire +blob. **In practice, this can translate into deserialization that is several +orders of magnitude faster**. + +{% highlight python %} +# Serialize the weights and copy them into the object store. Then deserialize +# them. +weights_id = ray.put(weights) # 0.525s +new_weights = ray.get(weights_id) # 0.000622s +{% endhighlight %} + +The call to `ray.put` serializes the weights using Arrow and copies the result +into the object store's memory. The call to `ray.get` then deserializes the +serialized object and constructs a new dictionary of numpy arrays. However, the +underlying arrays backing the numpy arrays live in shared memory and are not +copied into the Python process's heap. + +Note that if the call to `ray.get` happens from a different machine, the +relevant serialized object will be copied from a machine where it lives to the +machine where it is needed. + +In this example, we call `ray.put` explicitly. However, normally this call would +happen under the hood when a Python object is passed into a remote function or +returned from a remote function. + +# Feedback is Appreciated + +This project is in its early stages. If you try it out, we'd love to hear your +thoughts and suggestions. diff --git a/site/assets/announcing_ray/graph1.png b/site/assets/announcing_ray/graph1.png new file mode 100644 index 000000000..1817286e4 Binary files /dev/null and b/site/assets/announcing_ray/graph1.png differ diff --git a/site/assets/announcing_ray/graph2.png b/site/assets/announcing_ray/graph2.png new file mode 100644 index 000000000..3aad1d0e2 Binary files /dev/null and b/site/assets/announcing_ray/graph2.png differ diff --git a/site/assets/announcing_ray/graph3.png b/site/assets/announcing_ray/graph3.png new file mode 100644 index 000000000..6f128b270 Binary files /dev/null and b/site/assets/announcing_ray/graph3.png differ diff --git a/site/index.html b/site/index.html new file mode 100644 index 000000000..bbb7a0329 --- /dev/null +++ b/site/index.html @@ -0,0 +1,13 @@ +--- +layout: default +--- + +
+ +{% for post in site.posts %} + {{ post.date | date: "%b %-d, %Y" }} + {{ post.title }} + {{ post.excerpt }} +{% endfor %} + +