Merge pull request #1 from ray-project/switch

Move plasma, common, and photon into the Ray repository.
This commit is contained in:
Philipp Moritz
2016-10-25 14:33:51 -07:00
committed by GitHub
92 changed files with 20798 additions and 3981 deletions
-14
View File
@@ -1,17 +1,3 @@
[submodule "thirdparty/grpc"]
path = thirdparty/grpc
url = https://github.com/grpc/grpc
ignore = dirty
[submodule "thirdparty/numbuf"]
path = thirdparty/numbuf
url = https://github.com/ray-project/numbuf.git
[submodule "thirdparty/arrow"]
path = thirdparty/arrow
url = https://github.com/ray-project/arrow.git
[submodule "thirdparty/python"]
path = thirdparty/python
url = https://github.com/austinsc/python.git
ignore = dirty
[submodule "thirdparty/hiredis"]
path = thirdparty/hiredis
url = https://github.com/redis/hiredis.git
+46
View File
@@ -0,0 +1,46 @@
# Build script for the common library and its test binaries.
# Compiler and flags: C99 with POSIX feature macros, warnings treated as errors.
CC = gcc
CFLAGS = -g -Wall --std=c99 -D_XOPEN_SOURCE=500 -D_POSIX_C_SOURCE=200809L -fPIC -I. -Ithirdparty -Ithirdparty/ae -Wno-typedef-redefinition -Werror
# Output directory for the static library and test executables.
# NOTE(review): no rule visible here creates $(BUILD) -- confirm it exists before building.
BUILD = build
# Default target: build hiredis, then the static library.
all: hiredis $(BUILD)/libcommon.a
# Archive all object files of the common library into a static library.
$(BUILD)/libcommon.a: event_loop.o common.o task.o io.o state/redis.o thirdparty/ae/ae.o
ar rcs $@ $^
# Test executables; each one links against libcommon.a.
$(BUILD)/common_tests: test/common_tests.c $(BUILD)/libcommon.a
$(CC) -o $@ test/common_tests.c $(BUILD)/libcommon.a $(CFLAGS)
$(BUILD)/db_tests: hiredis test/db_tests.c $(BUILD)/libcommon.a
$(CC) -o $@ test/db_tests.c $(BUILD)/libcommon.a thirdparty/hiredis/libhiredis.a $(CFLAGS)
$(BUILD)/io_tests: test/io_tests.c $(BUILD)/libcommon.a
$(CC) -o $@ $^ $(CFLAGS)
$(BUILD)/task_tests: test/task_tests.c $(BUILD)/libcommon.a
$(CC) -o $@ $^ $(CFLAGS)
# NOTE(review): the prerequisite list names logging.h while the recipe compiles
# logging.c, so edits to logging.c alone do not trigger a rebuild -- confirm.
$(BUILD)/redis_tests: hiredis test/redis_tests.c $(BUILD)/libcommon.a logging.h
$(CC) -o $@ test/redis_tests.c logging.c $(BUILD)/libcommon.a thirdparty/hiredis/libhiredis.a $(CFLAGS)
# Remove object files and everything inside the build directory.
clean:
rm -f *.o state/*.o test/*.o thirdparty/ae/*.o
rm -rf $(BUILD)/*
# Build the bundled Redis server via the helper script in thirdparty.
redis:
cd thirdparty ; bash ./build-redis.sh
# Fetch the hiredis submodule and build it.
hiredis:
git submodule update --init --recursive -- "thirdparty/hiredis" ; cd thirdparty/hiredis ; make
# Start a Redis server in the background, wait a second, then run every test
# binary in sequence. The commands are joined with ';', so a failing test does
# not stop the following ones. The server is not shut down by this rule.
test: hiredis redis $(BUILD)/common_tests $(BUILD)/db_tests $(BUILD)/io_tests $(BUILD)/task_tests $(BUILD)/redis_tests FORCE
./thirdparty/redis-3.2.3/src/redis-server &
sleep 1s ; ./build/common_tests ; ./build/db_tests ; ./build/io_tests ; ./build/task_tests ; ./build/redis_tests
# Re-run each test binary under valgrind; any reported error yields exit code 1.
valgrind: test
valgrind --leak-check=full --error-exitcode=1 ./build/common_tests
valgrind --leak-check=full --error-exitcode=1 ./build/db_tests
valgrind --leak-check=full --error-exitcode=1 ./build/io_tests
valgrind --leak-check=full --error-exitcode=1 ./build/task_tests
valgrind --leak-check=full --error-exitcode=1 ./build/redis_tests
# Empty target used as a prerequisite to force the "test" rule to always run.
FORCE:
View File
+36
View File
@@ -0,0 +1,36 @@
#include "common.h"
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
/* Constant ID whose 20 bytes are all 255 (0xFF).
 * NOTE(review): presumably used as a "no ID" sentinel -- confirm with callers. */
const unique_id NIL_ID = {{255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255}};
/* Generate a random unique ID by reading UNIQUE_ID_SIZE bytes from
 * /dev/urandom.
 *
 * The process exits if /dev/urandom cannot be opened or read: returning an ID
 * built from uninitialized stack memory would silently produce colliding or
 * predictable IDs.
 *
 * @return A unique_id filled with UNIQUE_ID_SIZE random bytes. */
unique_id globally_unique_id(void) {
  /* Use /dev/urandom for "real" randomness. */
  int fd = open("/dev/urandom", O_RDONLY);
  if (fd == -1) {
    LOG_ERR("Could not generate random number");
    exit(-1);
  }
  unique_id result;
  size_t filled = 0;
  /* read() may return fewer bytes than requested, so loop until the whole ID
   * has been filled. */
  while (filled < UNIQUE_ID_SIZE) {
    ssize_t nbytes = read(fd, &result.id[filled], UNIQUE_ID_SIZE - filled);
    if (nbytes < 0 && errno == EINTR) {
      /* Interrupted by a signal before any data was read; retry. */
      continue;
    }
    if (nbytes <= 0) {
      LOG_ERR("Could not read from /dev/urandom");
      close(fd);
      exit(-1);
    }
    filled += (size_t) nbytes;
  }
  close(fd);
  return result;
}
/* Render the UNIQUE_ID_SIZE bytes at sha1 as lowercase hexadecimal text.
 * Two characters are written per input byte, followed by a terminating NUL,
 * so buffer must hold at least 2 * UNIQUE_ID_SIZE + 1 chars. Returns buffer. */
char *sha1_to_hex(const unsigned char *sha1, char *buffer) {
  static const char digits[] = "0123456789abcdef";
  for (int idx = 0; idx < UNIQUE_ID_SIZE; ++idx) {
    const unsigned int byte = sha1[idx];
    /* High nibble first, then low nibble. */
    buffer[2 * idx] = digits[byte >> 4];
    buffer[2 * idx + 1] = digits[byte & 0xf];
  }
  buffer[2 * UNIQUE_ID_SIZE] = '\0';
  return buffer;
}
+61
View File
@@ -0,0 +1,61 @@
#ifndef COMMON_H
#define COMMON_H
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
/* Debug logging to stderr with file and line. Compiles to nothing unless
 * RAY_COMMON_DEBUG is defined. M must be a string literal format. */
#ifndef RAY_COMMON_DEBUG
#define LOG_DEBUG(M, ...)
#else
#define LOG_DEBUG(M, ...) \
  fprintf(stderr, "[DEBUG] (%s:%d) " M "\n", __FILE__, __LINE__, ##__VA_ARGS__)
#endif

/* Log an error to stderr with file, line and the textual form of the current
 * errno ("None" if errno is 0). M must be a string literal format. */
#define LOG_ERR(M, ...)                                                     \
  fprintf(stderr, "[ERROR] (%s:%d: errno: %s) " M "\n", __FILE__, __LINE__, \
          errno == 0 ? "None" : strerror(errno), ##__VA_ARGS__)

/* Log an informational message to stderr with file and line. */
#define LOG_INFO(M, ...) \
  fprintf(stderr, "[INFO] (%s:%d) " M "\n", __FILE__, __LINE__, ##__VA_ARGS__)

/* Exit the process with status -1 if COND is false, after logging the failed
 * condition. The macro deliberately does NOT end in a semicolon: the caller
 * supplies it, which keeps `if (x) CHECK(y); else ...` well-formed. */
#define CHECK(COND)                        \
  do {                                     \
    if (!(COND)) {                         \
      LOG_ERR("Check failure: %s", #COND); \
      exit(-1);                            \
    }                                      \
  } while (0)

/* Like CHECK, but additionally logs the printf-style message M with its
 * arguments when the condition fails. */
#define CHECKM(COND, M, ...)                                   \
  do {                                                         \
    if (!(COND)) {                                             \
      LOG_ERR("Check failure: %s \n" M, #COND, ##__VA_ARGS__); \
      exit(-1);                                                \
    }                                                          \
  } while (0)
#define UNIQUE_ID_SIZE 20
/* Cleanup method for running tests with the greatest library.
 * Runs the test, then clears the Redis database by issuing FLUSHALL on the
 * given context and freeing the reply.
 * NOTE(review): this expands to two statements and already ends in a
 * semicolon, so it is not safe as the body of an unbraced if/else. */
#define RUN_REDIS_TEST(context, test) \
RUN_TEST(test); \
freeReplyObject(redisCommand(context, "FLUSHALL"));
/* A fixed-size identifier of UNIQUE_ID_SIZE raw bytes. It is wrapped in a
 * struct so that it can be assigned, passed and returned by value. */
typedef struct { unsigned char id[UNIQUE_ID_SIZE]; } unique_id;
/* Constant ID defined in common.c. */
extern const unique_id NIL_ID;
/* Generate a globally unique ID. */
unique_id globally_unique_id(void);
/* Convert a 20 byte sha1 hash to a hexadecimal string. This function assumes
 * that buffer points to an already allocated char array of size 2 *
 * UNIQUE_ID_SIZE + 1. Returns buffer. */
char *sha1_to_hex(const unsigned char *sha1, char *buffer);
/* Object IDs use the same representation as unique IDs. */
typedef unique_id object_id;
#endif
+33
View File
@@ -0,0 +1,33 @@
# Task specifications, task instances and task logs
A *task specification* contains all information that is needed for computing
the results of a task:
- The function ID of the function that executes the task
- The arguments (either object IDs for pass by reference
or values for pass by value)
- The IDs of the result objects
From these, a task ID can be computed which is also stored in the task
specification.
A *task instance* represents one execution of a task specification.
It consists of:
- A scheduling state (WAITING, SCHEDULED, RUNNING, DONE)
- The target node where the task is scheduled or executed
- A unique task instance ID that identifies the particular execution
of the task.
The task data structures are defined in `common/task.h`.
The *task log* is a mapping from the task instance ID to a sequence of
updates to the status of the task instance. It is updated by various parts
of the system:
1. The local scheduler writes it with status WAITING when it submits a task to the global scheduler
2. The global scheduler appends an update WAITING -> SCHEDULED together with the node ID when assigning the task to a local scheduler
3. The local scheduler appends an update SCHEDULED -> RUNNING when it assigns a task to a worker
4. The local scheduler appends an update RUNNING -> DONE when the task finishes execution
The task log is defined in `common/state/task_log.h`.
+62
View File
@@ -0,0 +1,62 @@
#include "event_loop.h"
#include "common.h"
#include <errno.h>
#define INITIAL_EVENT_LOOP_SIZE 1024
/* Allocate a new event loop with room for INITIAL_EVENT_LOOP_SIZE file
 * descriptors and hand it back to the caller. */
event_loop *event_loop_create() {
  event_loop *created = aeCreateEventLoop(INITIAL_EVENT_LOOP_SIZE);
  return created;
}
/* Release an event loop: first free every node of its timer list, then let
 * the ae library delete the loop itself. */
void event_loop_destroy(event_loop *loop) {
  /* Walk the timer list and free each node. This is to make valgrind happy. */
  aeTimeEvent *current = loop->timeEventHead;
  aeTimeEvent *following = NULL;
  for (; current != NULL; current = following) {
    /* Remember the successor before the node is released. */
    following = current->next;
    free(current);
  }
  aeDeleteEventLoop(loop);
}
/* Register handler for the given events on file descriptor fd, passing
 * context to the handler on every call.
 *
 * If the descriptor does not fit into the event loop (ae reports ERANGE), the
 * loop is grown and the registration is retried. The new size is 1.5 times the
 * old one, but never less than fd + 1, so that the retry cannot fail merely
 * because fd is far beyond the current capacity. Any remaining failure aborts
 * the process through CHECK. */
void event_loop_add_file(event_loop *loop,
                         int fd,
                         int events,
                         event_loop_file_handler handler,
                         void *context) {
  /* Try to add the file descriptor. */
  int err = aeCreateFileEvent(loop, fd, events, handler, context);
  /* If it cannot be added, increase the size of the event loop. */
  if (err == AE_ERR && errno == ERANGE) {
    int new_size = 3 * aeGetSetSize(loop) / 2;
    if (new_size <= fd) {
      /* Growing by 1.5x is not enough to hold this descriptor. */
      new_size = fd + 1;
    }
    err = aeResizeSetSize(loop, new_size);
    CHECK(err == AE_OK);
    err = aeCreateFileEvent(loop, fd, events, handler, context);
  }
  /* In any case, test if there were errors. */
  CHECK(err == AE_OK);
}
/* Unregister fd from the event loop for both read and write events. */
void event_loop_remove_file(event_loop *loop, int fd) {
  const int every_event = EVENT_LOOP_READ | EVENT_LOOP_WRITE;
  aeDeleteFileEvent(loop, fd, every_event);
}
/* Register a timer with the given delay in milliseconds. The handler receives
 * context when it is invoked; no finalizer is installed. Returns the value
 * that the ae library reports for the new time event. */
int64_t event_loop_add_timer(event_loop *loop,
                             int64_t milliseconds,
                             event_loop_timer_handler handler,
                             void *context) {
  long long created_id =
      aeCreateTimeEvent(loop, milliseconds, handler, context, NULL);
  return created_id;
}
/* Remove the timer with the given id from the event loop. The process is
 * aborted via CHECK if the ae library does not report success. */
void event_loop_remove_timer(event_loop *loop, timer_id timer_id) {
int err = aeDeleteTimeEvent(loop, timer_id);
CHECK(err == AE_OK); /* timer id found? */
}
/* Run the event loop by delegating to the ae library's main loop. */
void event_loop_run(event_loop *loop) {
aeMain(loop);
}
/* Ask the ae library to stop the given event loop. */
void event_loop_stop(event_loop *loop) {
aeStop(loop);
}
+77
View File
@@ -0,0 +1,77 @@
#ifndef EVENT_LOOP_H
#define EVENT_LOOP_H
#include <stdint.h>
#include "ae/ae.h"
/* Identifier of a timer registered with the event loop. */
typedef long long timer_id;
/* The event loop type is the ae library's aeEventLoop under a local name. */
typedef aeEventLoop event_loop;
/* File descriptor is readable. */
#define EVENT_LOOP_READ AE_READABLE
/* File descriptor is writable. */
#define EVENT_LOOP_WRITE AE_WRITABLE
/* Constant specifying that the timer is done and it will be removed. */
#define EVENT_LOOP_TIMER_DONE AE_NOMORE
/* Signature of the handler that will be called when there is a new event
* on the file descriptor that this handler has been registered for. The
* context is the one that was passed into add_file by the user. The
* events parameter indicates which event is available on the file,
* it can be EVENT_LOOP_READ or EVENT_LOOP_WRITE. */
typedef void (*event_loop_file_handler)(event_loop *loop,
int fd,
void *context,
int events);
/* This handler will be called when a timer times out. The id of the timer
* as well as the context that was specified when registering this handler
* are passed as arguments. The return is the number of milliseconds the
* timer shall be reset to or EVENT_LOOP_TIMER_DONE if the timer shall
* not be triggered again. */
typedef int (*event_loop_timer_handler)(event_loop *loop,
timer_id timer_id,
void *context);
/* Create and return a new event loop. */
event_loop *event_loop_create();
/* Deallocate space associated with the event loop that was created
* with the "create" function. */
void event_loop_destroy(event_loop *loop);
/* Register a handler that will be called any time a new event happens on
* a file descriptor. Can specify a context that will be passed as an
* argument to the handler. Currently there can only be one handler per file.
* The events parameter specifies which events we listen to: EVENT_LOOP_READ
* or EVENT_LOOP_WRITE. */
void event_loop_add_file(event_loop *loop,
int fd,
int events,
event_loop_file_handler handler,
void *context);
/* Remove a registered file event handler from the event loop. */
void event_loop_remove_file(event_loop *loop, int fd);
/* Register a handler that will be called after a time slice of
* "milliseconds" milliseconds. Can specify a context that will be passed
* as an argument to the handler. Return the id of the time event. */
int64_t event_loop_add_timer(event_loop *loop,
int64_t milliseconds,
event_loop_timer_handler handler,
void *context);
/* Remove a registered time event handler from the event loop. */
void event_loop_remove_timer(event_loop *loop, timer_id timer_id);
/* Run the event loop. */
void event_loop_run(event_loop *loop);
/* Stop the event loop. */
void event_loop_stop(event_loop *loop);
#endif
+333
View File
@@ -0,0 +1,333 @@
#include "io.h"
#include <stdlib.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <string.h>
#include <stdio.h>
#include <inttypes.h>
#include <stdarg.h>
#include <sys/ioctl.h>
#include <netinet/in.h>
#include <utstring.h>
#include "common.h"
/**
 * Binds to an Internet (IPv4, TCP) socket at the given port on all local
 * interfaces and starts listening on it. Returns a non-blocking file
 * descriptor for the socket, or -1 if an error occurred.
 *
 * @note Since the returned file descriptor is non-blocking, it is not
 * recommended to use the Linux read and write calls directly, since these
 * might read or write a partial message. Instead, use the provided
 * write_message and read_message methods.
 *
 * @param port The port to bind to.
 * @return A non-blocking file descriptor for the socket, or -1 if an error
 * occurs.
 */
int bind_inet_sock(const int port) {
  struct sockaddr_in name;
  int socket_fd = socket(PF_INET, SOCK_STREAM, 0);
  if (socket_fd < 0) {
    LOG_ERR("socket() failed for port %d.", port);
    return -1;
  }
  /* Zero the whole address first so that padding bytes handed to bind() are
   * well defined. */
  memset(&name, 0, sizeof(name));
  name.sin_family = AF_INET;
  name.sin_port = htons(port);
  name.sin_addr.s_addr = htonl(INADDR_ANY);
  int on = 1;
  /* Put the socket into non-blocking mode. */
  /* TODO(pcm): http://stackoverflow.com/q/1150635 */
  if (ioctl(socket_fd, FIONBIO, (char *) &on) < 0) {
    LOG_ERR("ioctl failed");
    close(socket_fd);
    return -1;
  }
  /* Allow the port to be reused. */
  if (setsockopt(socket_fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) < 0) {
    LOG_ERR("setsockopt failed for port %d", port);
    close(socket_fd);
    return -1;
  }
  if (bind(socket_fd, (struct sockaddr *) &name, sizeof(name)) < 0) {
    LOG_ERR("Bind failed for port %d", port);
    close(socket_fd);
    return -1;
  }
  if (listen(socket_fd, 5) == -1) {
    LOG_ERR("Could not listen to socket %d", port);
    close(socket_fd);
    return -1;
  }
  return socket_fd;
}
/**
 * Create a Unix domain stream socket, bind it to the given pathname and start
 * listening on it. Any file already present at the pathname is unlinked
 * before binding.
 *
 * @param socket_pathname The pathname for the socket.
 * @return A file descriptor for the listening socket, or -1 if an error
 *         occurs (the socket is closed again in that case).
 */
int bind_ipc_sock(const char *socket_pathname) {
  struct sockaddr_un addr;
  int reuse = 1;
  size_t bytes_needed;
  const int sock = socket(AF_UNIX, SOCK_STREAM, 0);
  if (sock < 0) {
    LOG_ERR("socket() failed for pathname %s.", socket_pathname);
    return -1;
  }
  /* Tell the system to allow the port to be reused. */
  if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *) &reuse,
                 sizeof(reuse)) < 0) {
    LOG_ERR("setsockopt failed for pathname %s", socket_pathname);
    goto fail;
  }

  unlink(socket_pathname);
  memset(&addr, 0, sizeof(struct sockaddr_un));
  addr.sun_family = AF_UNIX;
  /* The path, including its terminating NUL, must fit into sun_path. */
  bytes_needed = strlen(socket_pathname) + 1;
  if (bytes_needed > sizeof(addr.sun_path)) {
    LOG_ERR("Socket pathname is too long.");
    goto fail;
  }
  strncpy(addr.sun_path, socket_pathname, bytes_needed);

  if (bind(sock, (struct sockaddr *) &addr, sizeof(struct sockaddr_un)) != 0) {
    LOG_ERR("Bind failed for pathname %s.", socket_pathname);
    goto fail;
  }
  if (listen(sock, 5) == -1) {
    LOG_ERR("Could not listen to socket %s", socket_pathname);
    goto fail;
  }
  return sock;

fail:
  /* Every failure after the socket was created releases it here. */
  close(sock);
  return -1;
}
/**
 * Connects to a Unix domain streaming socket at the given pathname.
 *
 * @param socket_pathname The pathname of the socket to connect to.
 * @return A file descriptor for the connected socket, or -1 if an error
 *         occurred. On error the socket created here is closed again, so no
 *         descriptor is leaked.
 */
int connect_ipc_sock(const char *socket_pathname) {
  struct sockaddr_un socket_address;
  int socket_fd;

  socket_fd = socket(AF_UNIX, SOCK_STREAM, 0);
  if (socket_fd < 0) {
    LOG_ERR("socket() failed for pathname %s.", socket_pathname);
    return -1;
  }

  memset(&socket_address, 0, sizeof(struct sockaddr_un));
  socket_address.sun_family = AF_UNIX;
  /* The path, including its terminating NUL, must fit into sun_path. */
  if (strlen(socket_pathname) + 1 > sizeof(socket_address.sun_path)) {
    LOG_ERR("Socket pathname is too long.");
    close(socket_fd);
    return -1;
  }
  strncpy(socket_address.sun_path, socket_pathname,
          strlen(socket_pathname) + 1);

  if (connect(socket_fd, (struct sockaddr *) &socket_address,
              sizeof(struct sockaddr_un)) != 0) {
    LOG_ERR("Connection to socket failed for pathname %s.", socket_pathname);
    close(socket_fd);
    return -1;
  }

  return socket_fd;
}
/**
 * Accept one pending client connection on a listening socket.
 *
 * @param socket_fd The listening socket descriptor.
 * @return The descriptor of the accepted connection, or -1 if accept failed.
 */
int accept_client(int socket_fd) {
  const int connection_fd = accept(socket_fd, NULL, NULL);
  if (connection_fd >= 0) {
    return connection_fd;
  }
  LOG_ERR("Error reading from socket.");
  return -1;
}
/**
 * Write a sequence of bytes into a file descriptor. This will block until one
 * of the following happens: (1) there is an error, (2) write reports that zero
 * bytes were written, or (3) all length bytes have been written.
 *
 * Transient conditions (EAGAIN / EWOULDBLOCK on a non-blocking descriptor,
 * EINTR from a signal) are retried. Any other error is returned to the
 * caller instead of continuing with a corrupted cursor.
 *
 * @param fd The file descriptor to write to. It can be non-blocking.
 * @param cursor The cursor pointing to the beginning of the bytes to send.
 * @param length The size of the bytes sequence to write.
 * @return int Whether there was an error while writing. 0 corresponds to
 *         success and -1 corresponds to an error (errno will be set).
 */
int write_bytes(int fd, uint8_t *cursor, size_t length) {
  ssize_t nbytes = 0;
  while (length > 0) {
    /* While we haven't written the whole message, write to the file
     * descriptor, advance the cursor, and decrease the amount left to write. */
    nbytes = write(fd, cursor, length);
    if (nbytes < 0) {
      if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) {
        continue;
      }
      /* Any other error is fatal for this write; errno is left set. */
      return -1;
    }
    if (nbytes == 0) {
      return -1;
    }
    cursor += nbytes;
    length -= (size_t) nbytes;
  }
  return 0;
}
/**
 * Send one framed message over a file descriptor: first the type, then the
 * payload length, then the payload itself. The counterpart that decodes this
 * framing is read_message.
 *
 * @param fd The file descriptor to write to. It can be non-blocking.
 * @param type The type of the message to send.
 * @param length The size in bytes of the bytes parameter.
 * @param bytes The address of the message to send.
 * @return int 0 if all three parts were written; otherwise the non-zero
 *         result of the first write_bytes call that failed.
 */
int write_message(int fd, int64_t type, int64_t length, uint8_t *bytes) {
  /* Each part is only attempted if everything before it succeeded. */
  int status = write_bytes(fd, (uint8_t *) &type, sizeof(type));
  if (status == 0) {
    status = write_bytes(fd, (uint8_t *) &length, sizeof(length));
  }
  if (status == 0) {
    status = write_bytes(fd, bytes, length * sizeof(char));
  }
  return status;
}
/**
 * Read a sequence of bytes from a file descriptor into a buffer. This will
 * block until one of the following happens: (1) there is an error, (2) end of
 * file, or (3) all length bytes have been read.
 *
 * Transient conditions (EAGAIN / EWOULDBLOCK on a non-blocking descriptor,
 * EINTR from a signal) are retried. Any other error is returned to the
 * caller instead of continuing with a corrupted cursor.
 *
 * @note The buffer pointed to by cursor must already have length number of
 * bytes allocated before calling this method.
 *
 * @param fd The file descriptor to read from. It can be non-blocking.
 * @param cursor The cursor pointing to the beginning of the buffer.
 * @param length The size of the byte sequence to read.
 * @return int Whether there was an error while reading. 0 corresponds to
 *         success and -1 corresponds to an error or end of file (errno will
 *         be set in the error case).
 */
int read_bytes(int fd, uint8_t *cursor, size_t length) {
  ssize_t nbytes = 0;
  while (length > 0) {
    /* While we haven't read the whole message, read from the file descriptor,
     * advance the cursor, and decrease the amount left to read. */
    nbytes = read(fd, cursor, length);
    if (nbytes < 0) {
      if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) {
        continue;
      }
      /* Any other error is fatal for this read; errno is left set. */
      return -1;
    }
    if (nbytes == 0) {
      /* End of file: the peer closed before the full message arrived. */
      return -1;
    }
    cursor += nbytes;
    length -= (size_t) nbytes;
  }
  return 0;
}
/**
 * Read a sequence of bytes written by write_message from a file descriptor.
 * This allocates space for the message.
 *
 * A negative length received from the peer is treated like a disconnect,
 * since it cannot describe a valid payload. Failure to allocate a non-empty
 * payload buffer aborts the process via CHECK.
 *
 * @note The caller must free the memory.
 *
 * @param fd The file descriptor to read from. It can be non-blocking.
 * @param type The type of the message that is read will be written at this
 *        address. If there was an error while reading, this will be
 *        DISCONNECT_CLIENT.
 * @param length The size in bytes of the message that is read will be written
 *        at this address. This size does not include the bytes used to encode
 *        the type and length. If there was an error while reading, this will
 *        be 0.
 * @param bytes The address at which to write the pointer to the bytes that are
 *        read and allocated by this function. If there was an error while
 *        reading, this will be NULL.
 * @return Void.
 */
void read_message(int fd, int64_t *type, int64_t *length, uint8_t **bytes) {
  int closed = read_bytes(fd, (uint8_t *) type, sizeof(int64_t));
  if (closed) {
    goto disconnected;
  }
  closed = read_bytes(fd, (uint8_t *) length, sizeof(int64_t));
  if (closed) {
    goto disconnected;
  }
  if (*length < 0) {
    /* The length comes from the peer; never turn a negative value into a
     * huge allocation size. */
    goto disconnected;
  }
  *bytes = malloc(*length * sizeof(uint8_t));
  if (*length > 0) {
    /* malloc(0) may legitimately return NULL, so only check real requests. */
    CHECK(*bytes != NULL);
  }
  closed = read_bytes(fd, *bytes, *length);
  if (closed) {
    free(*bytes);
    goto disconnected;
  }
  return;

disconnected:
  /* Handle the case in which the socket is closed. */
  *type = DISCONNECT_CLIENT;
  *length = 0;
  *bytes = NULL;
  return;
}
/* Send a null-terminated string as a LOG_MESSAGE over the file descriptor.
 * The terminating \0 is transmitted as part of the payload. */
void write_log_message(int fd, char *message) {
  /* One extra byte for the \0 at the end of the string. */
  const int64_t payload_size = strlen(message) + 1;
  write_message(fd, LOG_MESSAGE, payload_size, (uint8_t *) message);
}
/* Receive a string that was sent with write_log_message. The message type is
 * verified with CHECK, so anything other than LOG_MESSAGE aborts the process.
 * The returned buffer was allocated by read_message.
 * NOTE: Caller must free the memory! */
char *read_log_message(int fd) {
  int64_t type;
  int64_t payload_size;
  uint8_t *payload;
  read_message(fd, &type, &payload_size, &payload);
  CHECK(type == LOG_MESSAGE);
  return (char *) payload;
}
/* Format a printf-style message into a temporary UT_string and send the
 * result with write_log_message. The temporary string is freed before
 * returning. */
void write_formatted_log_message(int socket_fd, const char *format, ...) {
  UT_string *formatted;
  va_list varargs;

  utstring_new(formatted);
  /* Expand the format with the caller's variadic arguments. */
  va_start(varargs, format);
  utstring_printf_va(formatted, format, varargs);
  va_end(varargs);

  write_log_message(socket_fd, utstring_body(formatted));
  utstring_free(formatted);
}
+32
View File
@@ -0,0 +1,32 @@
#ifndef IO_H
#define IO_H
#include <stdint.h>
/* Message types used in the type field of messages exchanged through
 * write_message and read_message. The enumerators have no explicit values,
 * so they are numbered 0, 1, 2 in declaration order. */
enum common_message_type {
/** Disconnect a client. read_message reports this type when reading fails. */
DISCONNECT_CLIENT,
/** Log a message from a client. */
LOG_MESSAGE,
/** Submit a task to the local scheduler. */
SUBMIT_TASK,
};
/* Helper functions for socket communication. */
/* Listen on a TCP port; returns a non-blocking descriptor or -1 on error. */
int bind_inet_sock(const int port);
/* Listen on a Unix domain socket path; returns a descriptor or -1 on error. */
int bind_ipc_sock(const char *socket_pathname);
/* Connect to a Unix domain socket path; returns a descriptor or -1 on error. */
int connect_ipc_sock(const char *socket_pathname);
/* Accept a client on a listening socket; returns a descriptor or -1. */
int accept_client(int socket_fd);
/* Reading and writing data */
/* Write type, length and then length payload bytes; returns 0 on success. */
int write_message(int fd, int64_t type, int64_t length, uint8_t *bytes);
/* Read a message written by write_message. *bytes is allocated here and must
 * be freed by the caller; when the connection is closed, *type is set to
 * DISCONNECT_CLIENT, *length to 0 and *bytes to NULL. */
void read_message(int fd, int64_t *type, int64_t *length, uint8_t **bytes);
/* Send a null-terminated string as a LOG_MESSAGE. */
void write_log_message(int fd, char *message);
/* Send a printf-style formatted string as a LOG_MESSAGE. */
void write_formatted_log_message(int fd, const char *format, ...);
/* Read a LOG_MESSAGE; the caller must free the returned string. */
char *read_log_message(int fd);
#endif
+344
View File
@@ -0,0 +1,344 @@
#include <Python.h>
#include "node.h"
#include "common_extension.h"
#include "task.h"
#include "utarray.h"
#include "utstring.h"
PyObject *CommonError;
#define MARSHAL_VERSION 2
/* Define the PyObjectID class. */
/* Converter usable with the "O&" format of PyArg_ParseTuple: copies the ID
 * out of a PyObjectID into *objectid and returns 1. For any other object a
 * TypeError is set and 0 is returned. */
int PyObjectToUniqueID(PyObject *object, object_id *objectid) {
  if (!PyObject_IsInstance(object, (PyObject *) &PyObjectIDType)) {
    PyErr_SetString(PyExc_TypeError, "must be an ObjectID");
    return 0;
  }
  *objectid = ((PyObjectID *) object)->object_id;
  return 1;
}
/* tp_init of ObjectID: expects one string argument of exactly
 * UNIQUE_ID_SIZE bytes and copies it into the object. Returns 0 on success
 * and -1 (with a Python exception set) otherwise. */
static int PyObjectID_init(PyObjectID *self, PyObject *args, PyObject *kwds) {
  const char *raw;
  int raw_length;
  const int parsed = PyArg_ParseTuple(args, "s#", &raw, &raw_length);
  if (!parsed) {
    return -1;
  }
  if (raw_length == UNIQUE_ID_SIZE) {
    memcpy(self->object_id.id, raw, UNIQUE_ID_SIZE);
    return 0;
  }
  /* Wrong number of bytes for an ID. */
  PyErr_SetString(CommonError,
                  "ObjectID: object id string needs to have length 20");
  return -1;
}
/* Create a PyObjectID from C.
 *
 * @param object_id The ID to wrap.
 * @return A new reference to a PyObjectID holding a copy of object_id, or
 *         NULL with a Python exception set if the allocation failed. */
PyObject *PyObjectID_make(object_id object_id) {
  /* PyObject_New already initializes the object header, so no separate
   * PyObject_Init call is needed. */
  PyObjectID *result = PyObject_New(PyObjectID, &PyObjectIDType);
  if (result == NULL) {
    return NULL;
  }
  result->object_id = object_id;
  return (PyObject *) result;
}
/* Method "id": return the UNIQUE_ID_SIZE raw bytes of the ID as a new Python
 * string. */
static PyObject *PyObjectID_id(PyObject *self) {
  PyObjectID *wrapper = (PyObjectID *) self;
  char *raw = (char *) &wrapper->object_id.id[0];
  return PyString_FromStringAndSize(raw, UNIQUE_ID_SIZE);
}
/* Method "__reduce__": always raises CommonError and returns NULL, so that
 * attempts to pickle an ObjectID fail. */
static PyObject *PyObjectID___reduce__(PyObjectID *self) {
PyErr_SetString(CommonError, "ObjectID objects cannot be serialized.");
return NULL;
}
/* Method table of the ObjectID type. */
static PyMethodDef PyObjectID_methods[] = {
    {"id", (PyCFunction) PyObjectID_id, METH_NOARGS,
     "Return the hash associated with this ObjectID"},
    /* Adjacent string literals are concatenated verbatim, so the first piece
     * must end with a space to keep the words apart. */
    {"__reduce__", (PyCFunction) PyObjectID___reduce__, METH_NOARGS,
     "Say how to pickle this ObjectID. This raises an exception to prevent "
     "object IDs from being serialized."},
    {NULL} /* Sentinel */
};
/* Member table of the ObjectID type; it contains only the sentinel entry. */
static PyMemberDef PyObjectID_members[] = {
{NULL} /* Sentinel */
};
/* Type object for the Python class "common.ObjectID". Only the name, size,
 * flags, doc string, method and member tables, tp_init and tp_new are set;
 * every other slot is left as 0. */
PyTypeObject PyObjectIDType = {
PyObject_HEAD_INIT(NULL) 0, /* ob_size */
"common.ObjectID", /* tp_name */
sizeof(PyObjectID), /* tp_basicsize */
0, /* tp_itemsize */
0, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_compare */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT, /* tp_flags */
"ObjectID object", /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
PyObjectID_methods, /* tp_methods */
PyObjectID_members, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc) PyObjectID_init, /* tp_init */
0, /* tp_alloc */
PyType_GenericNew, /* tp_new */
};
/* Define the PyTask class. */
/* tp_init of Task: build a task specification from Python arguments.
 *
 * Expected Python arguments: (function_id, arguments, num_returns), where
 * function_id is an ObjectID, arguments is a list whose items are either
 * ObjectIDs (passed by reference) or other values (marshalled and passed by
 * value), and num_returns is a non-negative int.
 *
 * @return 0 on success, -1 with a Python exception set on failure. No
 *         temporary storage is leaked on the failure paths. */
static int PyTask_init(PyTask *self, PyObject *args, PyObject *kwds) {
  function_id function_id;
  /* Arguments of the task (can be PyObjectIDs or Python values). */
  PyObject *arguments;
  int num_returns;
  /* Parse before allocating anything, so that a parse failure cannot leak. */
  if (!PyArg_ParseTuple(args, "O&Oi", &PyObjectToUniqueID, &function_id,
                        &arguments, &num_returns)) {
    return -1;
  }
  if (!PyList_Check(arguments)) {
    PyErr_SetString(PyExc_TypeError, "Task arguments must be a list");
    return -1;
  }
  if (num_returns < 0) {
    PyErr_SetString(PyExc_ValueError, "num_returns must be non-negative");
    return -1;
  }
  size_t size = PyList_Size(arguments);
  /* Array of pointers to string representations of pass-by-value args. */
  UT_array *val_repr_ptrs;
  utarray_new(val_repr_ptrs, &ut_ptr_icd);
  /* Determine the size of pass by value data in bytes. */
  size_t value_data_bytes = 0;
  for (size_t i = 0; i < size; ++i) {
    PyObject *arg = PyList_GetItem(arguments, i);
    if (!PyObject_IsInstance(arg, (PyObject *) &PyObjectIDType)) {
      PyObject *data = PyMarshal_WriteObjectToString(arg, MARSHAL_VERSION);
      if (data == NULL) {
        /* The value cannot be marshalled; the exception is already set.
         * Release the strings collected so far and the array itself. */
        for (unsigned j = 0; j < utarray_len(val_repr_ptrs); ++j) {
          PyObject *pending =
              *((PyObject **) utarray_eltptr(val_repr_ptrs, j));
          Py_DECREF(pending);
        }
        utarray_free(val_repr_ptrs);
        return -1;
      }
      value_data_bytes += PyString_Size(data);
      utarray_push_back(val_repr_ptrs, &data);
    }
  }
  /* Construct the task specification. */
  int val_repr_index = 0;
  self->spec =
      alloc_task_spec(function_id, size, num_returns, value_data_bytes);
  /* Add the task arguments. */
  for (size_t i = 0; i < size; ++i) {
    PyObject *arg = PyList_GetItem(arguments, i);
    if (PyObject_IsInstance(arg, (PyObject *) &PyObjectIDType)) {
      task_args_add_ref(self->spec, ((PyObjectID *) arg)->object_id);
    } else {
      /* Consume the marshalled strings in the order they were produced. */
      PyObject *data =
          *((PyObject **) utarray_eltptr(val_repr_ptrs, val_repr_index));
      task_args_add_val(self->spec, (uint8_t *) PyString_AS_STRING(data),
                        PyString_GET_SIZE(data));
      Py_DECREF(data);
      val_repr_index += 1;
    }
  }
  utarray_free(val_repr_ptrs);
  /* Generate and add the object IDs for the return values. */
  for (int i = 0; i < num_returns; ++i) {
    /* TODO(rkn): Later, this should be computed as a deterministic hash of (1)
     * the contents of the task, (2) the index i, and (3) a counter of the
     * number of tasks launched so far by the parent task. For now, we generate
     * it randomly. */
    *task_return(self->spec, i) = globally_unique_id();
  }
  return 0;
}
/* tp_dealloc of Task: release the owned task specification, then free the
 * Python object itself through the type's tp_free slot. */
static void PyTask_dealloc(PyTask *self) {
free_task_spec(self->spec);
Py_TYPE(self)->tp_free((PyObject *) self);
}
/* Method "function_id": wrap the function ID stored in the task
 * specification in a new ObjectID and return it. */
static PyObject *PyTask_function_id(PyObject *self) {
  task_spec *spec = ((PyTask *) self)->spec;
  return PyObjectID_make(*task_function(spec));
}
/* Method "arguments": return the task's arguments as a new Python list.
 * By-reference arguments become ObjectID objects; by-value arguments are
 * unmarshalled back into Python objects.
 *
 * @return A new list, or NULL with a Python exception set if the list or any
 *         of its items could not be created. */
static PyObject *PyTask_arguments(PyObject *self) {
  task_spec *task = ((PyTask *) self)->spec;
  int64_t num_args = task_num_args(task);
  PyObject *arg_list = PyList_New((Py_ssize_t) num_args);
  if (arg_list == NULL) {
    return NULL;
  }
  for (int i = 0; i < num_args; ++i) {
    PyObject *item;
    if (task_arg_type(task, i) == ARG_BY_REF) {
      object_id object_id = *task_arg_id(task, i);
      item = PyObjectID_make(object_id);
    } else {
      item = PyMarshal_ReadObjectFromString(
          (char *) task_arg_val(task, i),
          (Py_ssize_t) task_arg_length(task, i));
    }
    if (item == NULL) {
      /* Never hand out a list with an empty slot; propagate the error. */
      Py_DECREF(arg_list);
      return NULL;
    }
    /* PyList_SetItem takes over the reference to item. */
    PyList_SetItem(arg_list, i, item);
  }
  return arg_list;
}
/* Method "returns": build a new Python list with one ObjectID per return
 * value of the task. */
static PyObject *PyTask_returns(PyObject *self) {
  task_spec *task = ((PyTask *) self)->spec;
  int64_t num_returns = task_num_returns(task);
  PyObject *return_id_list = PyList_New((Py_ssize_t) num_returns);
  int slot = 0;
  while (slot < num_returns) {
    /* The list takes over the reference to the freshly made ObjectID. */
    PyList_SetItem(return_id_list, slot,
                   PyObjectID_make(*task_return(task, slot)));
    ++slot;
  }
  return return_id_list;
}
/* Method table of the Task type; every method takes no arguments. */
static PyMethodDef PyTask_methods[] = {
{"function_id", (PyCFunction) PyTask_function_id, METH_NOARGS,
"Return the function ID for this task."},
{"arguments", (PyCFunction) PyTask_arguments, METH_NOARGS,
"Return the arguments for the task."},
{"returns", (PyCFunction) PyTask_returns, METH_NOARGS,
"Return the object IDs for the return values of the task."},
{NULL} /* Sentinel */
};
/* Type object for the Python class "task.Task". It installs PyTask_dealloc
 * as destructor, PyTask_methods as method table and PyTask_init as tp_init;
 * every other optional slot is left as 0. */
PyTypeObject PyTaskType = {
PyObject_HEAD_INIT(NULL) 0, /* ob_size */
"task.Task", /* tp_name */
sizeof(PyTask), /* tp_basicsize */
0, /* tp_itemsize */
(destructor) PyTask_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_compare */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT, /* tp_flags */
"Task object", /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
PyTask_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc) PyTask_init, /* tp_init */
0, /* tp_alloc */
PyType_GenericNew, /* tp_new */
};
/* Create a PyTask from a C struct. The resulting PyTask takes ownership of the
 * task_spec and will deallocate the task_spec in the PyTask destructor.
 *
 * @param task_spec The task specification to wrap.
 * @return A new reference to a PyTask, or NULL with a Python exception set if
 *         the allocation failed. In the failure case no PyTask exists, so the
 *         task_spec is not freed here and remains owned by the caller. */
PyObject *PyTask_make(task_spec *task_spec) {
  /* PyObject_New already initializes the object header, so no separate
   * PyObject_Init call is needed. */
  PyTask *result = PyObject_New(PyTask, &PyTaskType);
  if (result == NULL) {
    return NULL;
  }
  result->spec = task_spec;
  return (PyObject *) result;
}
/* Define the methods for the module. */
#define SIZE_LIMIT 100
#define NUM_ELEMENTS_LIMIT 1000
/**
 * Decide whether a Python object is simple enough to be serialized and passed
 * by value inside a task instead of going through the object store.
 *
 * Accepted are ints, longs, floats, booleans and None; exact str and unicode
 * objects; and exact lists, dicts and tuples with fewer than SIZE_LIMIT
 * entries whose contents are themselves simple. A running element count is
 * kept across the whole recursion and the object is rejected as soon as that
 * count reaches NUM_ELEMENTS_LIMIT.
 *
 * @param value The Python object in question.
 * @param num_elements_contained Running count of elements seen so far. It is
 *        incremented for every object visited and by the length of every
 *        string visited.
 * @return 0 if the object cannot be serialized in the task and 1 if it can.
 */
int is_simple_value(PyObject *value, int *num_elements_contained) {
  /* Every visited object counts as one element. */
  *num_elements_contained += 1;
  if (*num_elements_contained >= NUM_ELEMENTS_LIMIT) {
    return 0;
  }

  /* Scalars are always fine. */
  const int is_scalar = PyInt_Check(value) || PyLong_Check(value) ||
                        value == Py_False || value == Py_True ||
                        PyFloat_Check(value) || value == Py_None;
  if (is_scalar) {
    return 1;
  }

  /* Strings contribute their length to the element count. */
  if (PyString_CheckExact(value)) {
    *num_elements_contained += PyString_Size(value);
    return (*num_elements_contained < NUM_ELEMENTS_LIMIT);
  }
  if (PyUnicode_CheckExact(value)) {
    *num_elements_contained += PyUnicode_GET_SIZE(value);
    return (*num_elements_contained < NUM_ELEMENTS_LIMIT);
  }

  /* Containers are simple only if they are small and all contents are. */
  if (PyList_CheckExact(value) && PyList_Size(value) < SIZE_LIMIT) {
    size_t index = 0;
    while (index < PyList_Size(value)) {
      PyObject *element = PyList_GetItem(value, index);
      if (!is_simple_value(element, num_elements_contained)) {
        return 0;
      }
      ++index;
    }
    return (*num_elements_contained < NUM_ELEMENTS_LIMIT);
  }
  if (PyDict_CheckExact(value) && PyDict_Size(value) < SIZE_LIMIT) {
    PyObject *entry_key, *entry_val;
    Py_ssize_t cursor = 0;
    while (PyDict_Next(value, &cursor, &entry_key, &entry_val)) {
      /* The key is examined first; the value only if the key is simple. */
      if (!is_simple_value(entry_key, num_elements_contained)) {
        return 0;
      }
      if (!is_simple_value(entry_val, num_elements_contained)) {
        return 0;
      }
    }
    return (*num_elements_contained < NUM_ELEMENTS_LIMIT);
  }
  if (PyTuple_CheckExact(value) && PyTuple_Size(value) < SIZE_LIMIT) {
    size_t index = 0;
    while (index < PyTuple_Size(value)) {
      PyObject *element = PyTuple_GetItem(value, index);
      if (!is_simple_value(element, num_elements_contained)) {
        return 0;
      }
      ++index;
    }
    return (*num_elements_contained < NUM_ELEMENTS_LIMIT);
  }

  /* Anything else is not considered simple. */
  return 0;
}
/* Python-callable wrapper around is_simple_value: takes one object and
 * returns True if it is simple enough to be passed by value, False otherwise.
 * Returns NULL if the arguments cannot be parsed. */
PyObject *check_simple_value(PyObject *self, PyObject *args) {
  PyObject *candidate;
  if (!PyArg_ParseTuple(args, "O", &candidate)) {
    return NULL;
  }
  /* The element count starts from zero for every top-level query. */
  int element_count = 0;
  if (!is_simple_value(candidate, &element_count)) {
    Py_RETURN_FALSE;
  }
  Py_RETURN_TRUE;
}
+37
View File
@@ -0,0 +1,37 @@
#ifndef COMMON_EXTENSION_H
#define COMMON_EXTENSION_H
#include <Python.h>
#include "marshal.h"
#include "structmember.h"
#include "common.h"
#include "task.h"
extern PyObject *CommonError;
// clang-format off
/* Python object wrapping an object_id by value. */
typedef struct {
PyObject_HEAD
object_id object_id;
} PyObjectID;
/* Python object holding a pointer to a task specification. */
typedef struct {
PyObject_HEAD
task_spec *spec;
} PyTask;
// clang-format on
/* Type objects for the ObjectID and Task Python classes. */
extern PyTypeObject PyObjectIDType;
extern PyTypeObject PyTaskType;
/* Copy the ID out of a PyObjectID; returns 1 on success, or 0 with a
 * TypeError set if object is not an ObjectID. */
int PyObjectToUniqueID(PyObject *object, object_id *objectid);
/* Create a new PyObjectID wrapping the given ID. */
PyObject *PyObjectID_make(object_id object_id);
/* Python-callable check whether an object may be passed by value. */
PyObject *check_simple_value(PyObject *self, PyObject *args);
/* Create a new PyTask wrapping the given task specification. */
PyObject *PyTask_make(task_spec *task_spec);
#endif /* COMMON_EXTENSION_H */
+38
View File
@@ -0,0 +1,38 @@
#include <Python.h>
#include "node.h"
#include "common_extension.h"
/* Module-level function table passed to Py_InitModule3 for the "common"
 * module. The all-NULL entry terminates the table. */
static PyMethodDef common_methods[] = {
{"check_simple_value", check_simple_value, METH_VARARGS,
"Should the object be passed by value?"},
{NULL} /* Sentinel */
};
#ifndef PyMODINIT_FUNC /* declarations for DLL import/export */
#define PyMODINIT_FUNC void
#endif
/* Initializer for the Python "common" module. Readies the Task and ObjectID
 * types, creates the module, and registers the two types plus the
 * "common.error" exception on it. Returns early if any step fails. */
PyMODINIT_FUNC initcommon(void) {
  if (PyType_Ready(&PyTaskType) < 0)
    return;

  if (PyType_Ready(&PyObjectIDType) < 0)
    return;

  PyObject *m =
      Py_InitModule3("common", common_methods,
                     "A module for common types. This is used for testing.");
  /* Module creation can fail; adding objects to a NULL module would crash. */
  if (m == NULL)
    return;

  Py_INCREF(&PyTaskType);
  PyModule_AddObject(m, "Task", (PyObject *) &PyTaskType);

  Py_INCREF(&PyObjectIDType);
  PyModule_AddObject(m, "ObjectID", (PyObject *) &PyObjectIDType);

  char common_error[] = "common.error";
  CommonError = PyErr_NewException(common_error, NULL, NULL);
  /* Do not touch the exception object if it could not be created. */
  if (CommonError == NULL)
    return;
  /* One reference is handed to the module, the other stays with the
   * CommonError global. */
  Py_INCREF(CommonError);
  PyModule_AddObject(m, "common_error", CommonError);
}
+12
View File
@@ -0,0 +1,12 @@
from setuptools import setup, find_packages, Extension
# The "common" extension is compiled from the two C sources listed below and
# linked against the prebuilt static library ../../build/libcommon.a.
# -Werror makes any compiler warning fail the build.
common_module = Extension("common",
sources=["common_module.c", "common_extension.c"],
include_dirs=["../../", "../../thirdparty"],
extra_objects=["../../build/libcommon.a"],
extra_compile_args=["--std=c99", "-Werror"])
# Package metadata; the only payload is the extension module above.
setup(name="Common",
version="0.1",
description="Common library for Ray",
ext_modules=[common_module])
+80
View File
@@ -0,0 +1,80 @@
#include "logging.h"
#include <stdint.h>
#include <inttypes.h>
#include <hiredis/hiredis.h>
#include <utstring.h>
#include "state/redis.h"
#include "io.h"
/* Printable names of the log levels, indexed by level value
 * (RAY_DEBUG = 0 through RAY_FATAL = 4). */
static const char *log_levels[5] = {"DEBUG", "INFO", "WARN", "ERROR", "FATAL"};
/* Command template for one log entry. ray_log fills the seven %s slots with,
 * in order: timestamp, client type, origin id (these three form the key),
 * level name, event type, message and timestamp again. */
static const char *log_fmt =
"HMSET log:%s:%s:%s log_level %s event_type %s message %s timestamp %s";
/* State of one logger, as filled in by init_ray_logger. The logger stores the
 * client_type and conn pointers as given; it does not copy what they point
 * to. */
struct ray_logger_impl {
/* String that identifies this client type. */
const char *client_type;
/* Suppress all log messages below this level. */
int log_level;
/* Whether or not we have a direct connection to Redis. */
int is_direct;
/* Either a db_handle or a socket to a process with a db_handle,
* depending on the is_direct flag. */
void *conn;
};
/* Allocate a logger and record the given settings in it.
 *
 * client_type and conn are stored as pointers, not copied, so they must stay
 * valid for as long as the logger is used. conn is interpreted as a
 * db_handle * when is_direct is non-zero and as an int * holding a socket fd
 * otherwise.
 *
 * Returns the new logger, or NULL if the allocation fails. The caller
 * releases it with free_ray_logger. */
ray_logger *init_ray_logger(const char *client_type,
                            int log_level,
                            int is_direct,
                            void *conn) {
  ray_logger *logger = malloc(sizeof(ray_logger));
  if (logger == NULL) {
    /* Out of memory: report it to the caller instead of writing through a
     * NULL pointer. */
    return NULL;
  }
  logger->client_type = client_type;
  logger->log_level = log_level;
  logger->is_direct = is_direct;
  logger->conn = conn;
  return logger;
}
/* Release the logger structure itself. The connection and the client_type
 * string it points to are left untouched. */
void free_ray_logger(ray_logger *logger) {
free(logger);
}
/* Record one log entry.
 *
 * The entry is dropped if log_level is below the logger's threshold or lies
 * outside RAY_DEBUG..RAY_FATAL (the range covered by log_levels). Otherwise
 * the entry is formatted with log_fmt and either issued as an asynchronous
 * Redis command (direct loggers) or written to the logger's socket.
 *
 * The timestamp is "<seconds>.<microseconds>" with the microseconds
 * zero-padded to six digits, so it reads as a decimal number of seconds. */
void ray_log(ray_logger *logger,
             int log_level,
             const char *event_type,
             const char *message) {
  if (log_level < logger->log_level) {
    return;
  }
  /* Levels outside this range have no name in log_levels. */
  if (log_level < RAY_DEBUG || log_level > RAY_FATAL) {
    return;
  }
  struct timeval tv;
  gettimeofday(&tv, NULL);

  UT_string *timestamp;
  utstring_new(timestamp);
  /* %06ld: without padding, 5 microseconds would be rendered as ".5". Both
   * fields are cast because their underlying types are platform-defined. */
  utstring_printf(timestamp, "%ld.%06ld", (long) tv.tv_sec, (long) tv.tv_usec);

  UT_string *origin_id;
  utstring_new(origin_id);
  if (logger->is_direct) {
    db_handle *db = (db_handle *) logger->conn;
    utstring_printf(origin_id, "%" PRId64 ":%s", db->client_id, "");
    redisAsyncCommand(db->context, NULL, NULL, log_fmt,
                      utstring_body(timestamp), logger->client_type,
                      utstring_body(origin_id), log_levels[log_level],
                      event_type, message, utstring_body(timestamp));
  } else {
    /* If we don't own a Redis connection, we leave our client
     * ID to be filled in by someone else: the origin id is the literal
     * placeholder text "%ld:%ld". */
    utstring_printf(origin_id, "%s:%s", "%ld", "%ld");
    int *socket_fd = (int *) logger->conn;
    write_formatted_log_message(*socket_fd, log_fmt, utstring_body(timestamp),
                                logger->client_type, utstring_body(origin_id),
                                log_levels[log_level], event_type, message,
                                utstring_body(timestamp));
  }
  utstring_free(origin_id);
  utstring_free(timestamp);
}
+39
View File
@@ -0,0 +1,39 @@
#ifndef LOGGING_H
#define LOGGING_H
#define RAY_VERBOSE -1
#define RAY_DEBUG 0
#define RAY_INFO 1
#define RAY_WARNING 2
#define RAY_ERROR 3
#define RAY_FATAL 4
/* Entity types. */
#define RAY_FUNCTION "FUNCTION"
#define RAY_OBJECT "OBJECT"
#define RAY_TASK "TASK"
typedef struct ray_logger_impl ray_logger;
/* Initialize a Ray logger for the given client type and logging level. If the
* is_direct flag is set, the logger will treat the given connection as a
* direct connection to the log. Otherwise, it will treat it as a socket to
* another process with a connection to the log.
* NOTE: User is responsible for freeing the returned logger. */
ray_logger *init_ray_logger(const char *client_type,
int log_level,
int is_direct,
void *conn);
/* Free the logger. This does not free the connection to the log. */
void free_ray_logger(ray_logger *logger);
/* Log an event at the given log level with the given event_type.
* NOTE: message cannot contain spaces! JSON format is recommended.
* TODO: Support spaces in messages. */
void ray_log(ray_logger *logger,
int log_level,
const char *event_type,
const char *message);
#endif
+32
View File
@@ -0,0 +1,32 @@
#ifndef DB_H
#define DB_H
#include "event_loop.h"
typedef struct db_handle_impl db_handle;
/* Connect to the global system store at address and port. Returns
* a handle to the database, which must be freed with db_disconnect
* after use. */
db_handle *db_connect(const char *db_address,
int db_port,
const char *client_type,
const char *client_addr,
int client_port);
/* Attach global system store connection to event loop. */
void db_attach(db_handle *db, event_loop *loop);
/* Disconnect from the global system store. */
void db_disconnect(db_handle *db);
/**
* Returns the client ID, according to the database.
*
* @param db The handle to the database.
* @returns int The client ID for this connection to the database. If
* this client has no connection to the database, returns -1.
*/
int get_client_id(db_handle *db);
#endif
+25
View File
@@ -0,0 +1,25 @@
#include "common.h"
#include "db.h"
/* The callback that is called when the result of a lookup
* in the object table comes back. The callback should free
* the manager_vector array, but NOT the strings they are pointing to. */
typedef void (*lookup_callback)(object_id object_id,
int manager_count,
const char *manager_vector[],
void *context);
/* Register a new object with the directory. */
/* TODO(pcm): Retry, print for each attempt. */
void object_table_add(db_handle *db, object_id object_id);
/* Remove object from the directory. */
void object_table_remove(db_handle *db,
object_id object_id,
const char *manager);
/* Look up entry from the directory */
void object_table_lookup(db_handle *db,
object_id object_id,
lookup_callback callback,
void *context);
+244
View File
@@ -0,0 +1,244 @@
/* Redis implementation of the global state store */
#include <assert.h>
#include <stdlib.h>
#include "hiredis/adapters/ae.h"
#include "utstring.h"
#include "common.h"
#include "db.h"
#include "object_table.h"
#include "task_log.h"
#include "event_loop.h"
#include "redis.h"
#include "io.h"
/* Print an error message to stderr together with the error string of a
 * hiredis context. `context` is any expression yielding a pointer to a struct
 * with an `errstr` member; M is a printf-style format literal. */
#define LOG_REDIS_ERR(context, M, ...)                                        \
  fprintf(stderr, "[ERROR] (%s:%d: message: %s) " M "\n", __FILE__, __LINE__, \
          (context)->errstr, ##__VA_ARGS__)

/* Terminate the process if a hiredis connect call failed, i.e. the context is
 * NULL or has its `err` field set. CONTEXT_TYPE is the context struct type
 * and `context` is evaluated exactly once.
 *
 * The macro expands to a single statement WITHOUT a trailing semicolon, so
 * `CHECK_REDIS_CONNECT(...);` can be used anywhere a statement can, including
 * as the unbraced body of an if that has an else. */
#define CHECK_REDIS_CONNECT(CONTEXT_TYPE, context, M, ...) \
  do {                                                     \
    CONTEXT_TYPE *_context = (context);                    \
    if (!_context) {                                       \
      LOG_ERR("could not allocate redis context");         \
      exit(-1);                                            \
    }                                                      \
    if (_context->err) {                                   \
      LOG_REDIS_ERR(_context, M, ##__VA_ARGS__);           \
      exit(-1);                                            \
    }                                                      \
  } while (0)
db_handle *db_connect(const char *address,
int port,
const char *client_type,
const char *client_addr,
int client_port) {
db_handle *db = malloc(sizeof(db_handle));
/* Sync connection for initial handshake */
redisReply *reply;
long long num_clients;
redisContext *context = redisConnect(address, port);
CHECK_REDIS_CONNECT(redisContext, context, "could not connect to redis %s:%d",
address, port);
/* Add new client using optimistic locking. */
while (1) {
reply = redisCommand(context, "WATCH %s", client_type);
freeReplyObject(reply);
reply = redisCommand(context, "HLEN %s", client_type);
num_clients = reply->integer;
freeReplyObject(reply);
reply = redisCommand(context, "MULTI");
freeReplyObject(reply);
reply = redisCommand(context, "HSET %s %lld %s:%d", client_type,
num_clients, client_addr, client_port);
freeReplyObject(reply);
reply = redisCommand(context, "EXEC");
CHECK(reply);
if (reply->type != REDIS_REPLY_NIL) {
freeReplyObject(reply);
break;
}
freeReplyObject(reply);
}
db->client_type = strdup(client_type);
db->client_id = num_clients;
db->service_cache = NULL;
db->sync_context = context;
utarray_new(db->callback_freelist, &ut_ptr_icd);
/* Establish async connection */
db->context = redisAsyncConnect(address, port);
CHECK_REDIS_CONNECT(redisAsyncContext, db->context,
"could not connect to redis %s:%d", address, port);
db->context->data = (void *) db;
/* Establish async connection for subscription */
db->sub_context = redisAsyncConnect(address, port);
CHECK_REDIS_CONNECT(redisAsyncContext, db->sub_context,
"could not connect to redis %s:%d", address, port);
db->sub_context->data = (void *) db;
return db;
}
/* Tear down a handle created by db_connect: close the three Redis
 * connections, then release the service cache, the client type string, every
 * pointer recorded in the callback freelist, and the handle itself. */
void db_disconnect(db_handle *db) {
  redisFree(db->sync_context);
  redisAsyncFree(db->context);
  redisAsyncFree(db->sub_context);

  /* Drain the address cache; each entry owns its addr string. */
  service_cache_entry *entry, *next_entry;
  HASH_ITER(hh, db->service_cache, entry, next_entry) {
    HASH_DEL(db->service_cache, entry);
    free(entry->addr);
    free(entry);
  }

  free(db->client_type);

  /* The freelist stores pointers; free what each slot points to. */
  void **slot = NULL;
  for (;;) {
    slot = (void **) utarray_next(db->callback_freelist, slot);
    if (slot == NULL) {
      break;
    }
    free(*slot);
  }
  utarray_free(db->callback_freelist);

  free(db);
}
/* Attach both asynchronous Redis connections of db (command and
 * subscription) to the given event loop. The synchronous connection is not
 * attached. */
void db_attach(db_handle *db, event_loop *loop) {
redisAeAttach(loop, db->context);
redisAeAttach(loop, db->sub_context);
}
/* Asynchronously add this client's id to the set "obj:<object id bytes>".
 * Failures are only logged. */
void object_table_add(db_handle *db, unique_id object_id) {
  /* Variadic arguments must match their conversions exactly: %b consumes a
   * size_t length, and client_id is an int64_t, so it is passed as long long
   * for %lld rather than through %d. */
  redisAsyncCommand(db->context, NULL, NULL, "SADD obj:%b %lld",
                    &object_id.id[0], (size_t) UNIQUE_ID_SIZE,
                    (long long) db->client_id);
  if (db->context->err) {
    LOG_REDIS_ERR(db->context, "could not add object_table entry");
  }
}
/* hiredis callback for the SMEMBERS command issued by object_table_lookup.
 *
 * Each member of the reply is a client id. Its address is taken from the
 * handle's service cache, or fetched once over the synchronous connection and
 * then cached. The user callback stored in privdata receives a freshly
 * allocated array of pointers to the cached address strings; it owns the
 * array but not the strings.
 *
 * privdata (a lookup_callback_data) is released here on every path. */
void object_table_get_entry(redisAsyncContext *c, void *r, void *privdata) {
  db_handle *db = c->data;
  lookup_callback_data *cb_data = privdata;
  redisReply *reply = r;

  if (reply == NULL) {
    /* No reply was delivered for this request. Nothing else holds on to the
     * request data allocated by object_table_lookup, so release it here. */
    free(privdata);
    return;
  }
  if (reply->type != REDIS_REPLY_ARRAY) {
    LOG_ERR("expected integer or string, received type %d", reply->type);
    exit(-1);
  }

  int64_t manager_count = reply->elements;
  const char **manager_vector = malloc(manager_count * sizeof(char *));
  for (int64_t j = 0; j < manager_count; ++j) {
    CHECK(reply->element[j]->type == REDIS_REPLY_STRING);
    int manager_id = atoi(reply->element[j]->str);
    service_cache_entry *entry;
    HASH_FIND_INT(db->service_cache, &manager_id, entry);
    if (!entry) {
      /* Cache miss: look the address up synchronously. The id is widened to
       * long long to match the %lld conversion. */
      redisReply *addr_reply =
          redisCommand(db->sync_context, "HGET %s %lld", db->client_type,
                       (long long) manager_id);
      CHECK(addr_reply);
      CHECK(addr_reply->type == REDIS_REPLY_STRING);
      entry = malloc(sizeof(service_cache_entry));
      entry->service_id = manager_id;
      entry->addr = strdup(addr_reply->str);
      HASH_ADD_INT(db->service_cache, service_id, entry);
      freeReplyObject(addr_reply);
    }
    manager_vector[j] = entry->addr;
  }

  cb_data->callback(cb_data->object_id, manager_count, manager_vector,
                    cb_data->context);
  free(privdata);
}
/* Asynchronously look up which clients hold object_id. When the reply
 * arrives, callback is invoked (through object_table_get_entry) with the
 * object id, the number of managers, an array of their address strings and
 * the caller's context. Failures are only logged. */
void object_table_lookup(db_handle *db,
                         object_id object_id,
                         lookup_callback callback,
                         void *context) {
  lookup_callback_data *cb_data = malloc(sizeof(lookup_callback_data));
  cb_data->callback = callback;
  cb_data->object_id = object_id;
  cb_data->context = context;
  /* %b consumes a size_t length. */
  int status = redisAsyncCommand(db->context, object_table_get_entry, cb_data,
                                 "SMEMBERS obj:%b", &object_id.id[0],
                                 (size_t) UNIQUE_ID_SIZE);
  if (status != REDIS_OK) {
    /* The command was not queued, so object_table_get_entry will never run
     * for it and cannot release the request data. */
    free(cb_data);
  }
  if (status != REDIS_OK || db->context->err) {
    LOG_REDIS_ERR(db->context, "error in object_table lookup");
  }
}
/* Store a task instance and announce it.
 *
 * The raw bytes of the instance are written to field "0" of the hash
 * "tasklog:<instance id>" and then published on the channel
 * "task_log:<node id>:<state>". Both commands are asynchronous; failures are
 * only logged. */
void task_log_add_task(db_handle *db, task_instance *task_instance) {
  /* %b consumes a (pointer, size_t) pair, so every length is passed as
   * size_t explicitly. */
  size_t instance_size = (size_t) task_instance_size(task_instance);

  task_iid task_iid = *task_instance_id(task_instance);
  redisAsyncCommand(db->context, NULL, NULL, "HMSET tasklog:%b 0 %b",
                    (char *) &task_iid.id[0], (size_t) UNIQUE_ID_SIZE,
                    (char *) task_instance, instance_size);
  if (db->context->err) {
    LOG_REDIS_ERR(db->context, "error setting task in task_log_add_task");
  }

  node_id node = *task_instance_node(task_instance);
  int32_t state = *task_instance_state(task_instance);
  redisAsyncCommand(db->context, NULL, NULL, "PUBLISH task_log:%b:%d %b",
                    (char *) &node.id[0], (size_t) UNIQUE_ID_SIZE, state,
                    (char *) task_instance, instance_size);
  if (db->context->err) {
    LOG_REDIS_ERR(db->context, "error publishing task in task_log_add_task");
  }
}
/* hiredis callback for the subscriptions made by task_log_register_callback.
 *
 * Replies on a subscribed connection are arrays whose last element is the
 * payload:
 *   SUBSCRIBE  delivers  "message",  channel, payload            (3 elements)
 *   PSUBSCRIBE delivers  "pmessage", pattern, channel, payload   (4 elements)
 * The acknowledgements of the subscription itself end in an integer, whose
 * string pointer is NULL. Reading the last element therefore handles both
 * subscription kinds.
 *
 * The payload is copied into a temporary task_instance that is handed to the
 * user callback and freed afterwards. */
void task_log_redis_callback(redisAsyncContext *c,
                             void *reply,
                             void *privdata) {
  redisReply *r = reply;
  if (reply == NULL)
    return;
  CHECK(r->type == REDIS_REPLY_ARRAY);
  /* First entry is message type, last entry is the payload. */
  CHECK(r->elements > 2);
  redisReply *payload = r->element[r->elements - 1];
  /* If this condition is true, we got the initial message that acknowledged
   * the subscription. */
  if (payload->str == NULL) {
    return;
  }
  /* Otherwise, parse the task and call the callback. */
  CHECK(privdata);
  task_log_callback_data *callback_data = privdata;
  task_instance *instance = malloc(payload->len);
  CHECK(instance);
  memcpy(instance, payload->str, payload->len);
  callback_data->callback(instance, callback_data->userdata);
  task_instance_free(instance);
}
/* Subscribe callback to task log publications.
 *
 * If node equals NIL_ID, a pattern subscription "task_log:*:<state>" covers
 * all nodes; otherwise the single channel "task_log:<node id>:<state>" is
 * subscribed. The state is formatted as a plain integer, so it has to equal
 * the value used when the task was published.
 *
 * The bookkeeping record is pushed onto the handle's callback freelist and is
 * released by db_disconnect. */
void task_log_register_callback(db_handle *db,
                                task_log_callback callback,
                                node_id node,
                                int32_t state,
                                void *userdata) {
  task_log_callback_data *callback_data =
      malloc(sizeof(task_log_callback_data));
  CHECK(callback_data);
  utarray_push_back(db->callback_freelist, &callback_data);
  callback_data->callback = callback;
  callback_data->userdata = userdata;
  if (memcmp(&node.id[0], &NIL_ID.id[0], UNIQUE_ID_SIZE) == 0) {
    redisAsyncCommand(db->sub_context, task_log_redis_callback, callback_data,
                      "PSUBSCRIBE task_log:*:%d", state);
  } else {
    /* %b consumes a size_t length. */
    redisAsyncCommand(db->sub_context, task_log_redis_callback, callback_data,
                      "SUBSCRIBE task_log:%b:%d", (char *) &node.id[0],
                      (size_t) UNIQUE_ID_SIZE, state);
  }
  if (db->sub_context->err) {
    LOG_REDIS_ERR(db->sub_context, "error in task_log_register_callback");
  }
}
/* Client id stored in the handle, or -1 when db is NULL. */
int get_client_id(db_handle *db) {
  if (!db) {
    return -1;
  }
  return db->client_id;
}
+68
View File
@@ -0,0 +1,68 @@
#ifndef REDIS_H
#define REDIS_H
#include "db.h"
#include "object_table.h"
#include "task_log.h"
#include "hiredis/hiredis.h"
#include "hiredis/async.h"
#include "uthash.h"
#include "utarray.h"
/* One entry of the uthash table that caches service addresses, keyed by
 * service_id. */
typedef struct {
/* Unique ID for this service. */
int service_id;
/* IP address and port of this service. */
char *addr;
/* Handle for the uthash table. */
UT_hash_handle hh;
} service_cache_entry;
/* Pairs a task log callback with the userdata pointer to pass to it. */
typedef struct {
/* The callback that will be called. */
task_log_callback callback;
/* Userdata associated with the callback. */
void *userdata;
} task_log_callback_data;
/* Redis-backed implementation of the opaque db_handle type: three Redis
 * connections (asynchronous commands, asynchronous subscriptions,
 * synchronous) plus per-client bookkeeping. */
struct db_handle_impl {
/* String that identifies this client type. */
char *client_type;
/* Unique ID for this client within the type. */
int64_t client_id;
/* Redis context for this global state store connection. */
redisAsyncContext *context;
/* Redis context for "subscribe" communication.
* Yes, we need a separate one for that, see
* https://github.com/redis/hiredis/issues/55 */
redisAsyncContext *sub_context;
/* The event loop this global state store connection is part of. */
event_loop *loop;
/* Index of the database connection in the event loop */
int64_t db_index;
/* Cache for the IP addresses of services. */
service_cache_entry *service_cache;
/* Redis context for synchronous connections.
* Should only be used very rarely, it is not asynchronous. */
redisContext *sync_context;
/* Data structure for callbacks that needs to be freed. */
UT_array *callback_freelist;
};
/* Everything needed to invoke a lookup_callback once the reply to an object
 * table lookup has arrived. */
typedef struct {
/* The callback that will be called. */
lookup_callback callback;
/* Object ID that is looked up. */
object_id object_id;
/* Data context for the callback. */
void *context;
} lookup_callback_data;
void object_table_get_entry(redisAsyncContext *c, void *r, void *privdata);
void object_table_lookup_callback(redisAsyncContext *c,
void *r,
void *privdata);
#endif
+41
View File
@@ -0,0 +1,41 @@
#ifndef TASK_LOG_H
#define TASK_LOG_H
#include "db.h"
#include "task.h"
/* The task log is a message bus that is used for all communication between
* local and global schedulers (and also persisted to the state database).
* Here are examples of events that are recorded by the task log:
*
 * 1) local scheduler writes it when it submits a task to the global scheduler;
* 2) global scheduler reads it to get the task submitted by local schedulers;
* 3) global scheduler writes it when assigning the task to a local scheduler;
* 4) local scheduler reads it to get its tasks assigned by global scheduler;
* 5) local scheduler writes it when a task finishes execution;
* 6) global scheduler reads it to get the tasks that have finished; */
/* Callback for subscribing to the task log. */
typedef void (*task_log_callback)(task_instance *task_instance, void *userdata);
/* Initially add a task instance to the task log. */
void task_log_add_task(db_handle *db, task_instance *task_instance);
/* Update task instance in the task log. */
void task_log_update_task(db_handle *db,
task_iid task_iid,
int32_t state,
node_id node);
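/* Register callback for a certain event. The node specifies the node whose
 * events we want to listen to. If you want to listen to all events for this
 * node, use state_filter =
 * TASK_STATUS_WAITING | TASK_STATUS_SCHEDULED | TASK_STATUS_RUNNING |
 * TASK_STATUS_DONE.
 * If you want to register to updates from all nodes, set node = NIL_ID.
 * NOTE(review): the Redis implementation subscribes to a channel named after
 * the integer value of state_filter, so a combined mask presumably only
 * matches tasks published with exactly that value -- confirm. */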
void task_log_register_callback(db_handle *db,
task_log_callback callback,
node_id node,
int32_t state_filter,
void *userdata);
#endif /* TASK_LOG_H */
+20
View File
@@ -0,0 +1,20 @@
#ifndef TASK_TABLE_H
#define TASK_TABLE_H
#include "db.h"
#include "task.h"
/* Add task to the task table, handle errors here. */
status task_table_add_task(db_handle *db, task_spec *task);
/* Callback for getting an entry from the task table. Task spec will be freed
* by the system after the callback */
typedef void (*task_table_callback)(task_spec *task, void *context);
/* Get specific task from the task table. */
status task_table_get_task(db_handle *db,
task_id task_id,
task_table_callback callback,
void *context);
#endif /* TASK_TABLE_H */
+218
View File
@@ -0,0 +1,218 @@
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "utarray.h"
#include "task.h"
#include "common.h"
#include "io.h"
/* TASK SPECIFICATIONS */
/* Tasks are stored in a consecutive chunk of memory, the first
* sizeof(task_spec) bytes are arranged according to the struct
* task_spec. Then there is an array of task_args of length
* (num_args + num_returns), and then follows the data of
* pass-by-value arguments of size args_value_size. The offsets in the
* task_arg.val are with respect to the end of the augmented structure,
* i.e. with respect to the address &task_spec.args_and_returns[0] +
* (task_spec->num_args + task_spec->num_returns) * sizeof(task_arg). */
/* One slot of task_spec.args_and_returns: either an object id (arguments
 * passed by reference, and return values) or the location of a value stored
 * inline in the task. */
typedef struct {
/* Either ARG_BY_REF or ARG_BY_VAL. */
int8_t type;
union {
object_id obj_id;
struct {
/* Offset where the data associated to this arg is located, relative to
* the end of the args_and_returns array, i.e. to
* &task_spec.args_and_returns[0] +
* (num_args + num_returns) * sizeof(task_arg). */
ptrdiff_t offset;
/* Number of bytes of data stored at that offset. */
int64_t length;
} value;
};
} task_arg;
/* Fixed-size header of a task specification. It is followed in the same
 * allocation by num_args + num_returns task_arg slots and then by
 * args_value_size bytes of pass-by-value data (see TASK_SPEC_SIZE). */
struct task_spec_impl {
/* Function ID of the task. */
function_id function_id;
/* Total number of arguments. */
int64_t num_args;
/* Index of the next argument slot to be filled in; equal to the number of
* arguments added so far. */
int64_t arg_index;
/* Number of return values. */
int64_t num_returns;
/* Number of bytes the pass-by-value arguments are occupying. */
int64_t args_value_size;
/* The offset of the number of bytes of pass-by-value data that
* has been written so far, relative to &task_spec->args_and_returns[0] +
* (task_spec->num_args + task_spec->num_returns) * sizeof(task_arg) */
int64_t args_value_offset;
/* Argument and return IDs as well as offsets for pass-by-value args. */
task_arg args_and_returns[0];
};
/* The size of a task specification is given by the following expression. */
#define TASK_SPEC_SIZE(NUM_ARGS, NUM_RETURNS, ARGS_VALUE_SIZE) \
(sizeof(task_spec) + ((NUM_ARGS) + (NUM_RETURNS)) * sizeof(task_arg) + \
(ARGS_VALUE_SIZE))
/* Allocate a zero-filled task specification with room for num_args argument
 * slots, num_returns return slots and args_value_size bytes of inline
 * argument data, and fill in the header. Arguments are added afterwards with
 * task_args_add_ref / task_args_add_val. */
task_spec *alloc_task_spec(function_id function_id,
                           int64_t num_args,
                           int64_t num_returns,
                           int64_t args_value_size) {
  int64_t num_bytes = TASK_SPEC_SIZE(num_args, num_returns, args_value_size);
  task_spec *spec = malloc(num_bytes);
  /* Zeroing also resets args_value_offset and every slot's type field. */
  memset(spec, 0, num_bytes);

  spec->args_value_size = args_value_size;
  spec->num_returns = num_returns;
  spec->arg_index = 0;
  spec->num_args = num_args;
  spec->function_id = function_id;
  return spec;
}
/* Total size in bytes of the task specification: header, all argument and
 * return slots, and the inline argument data. */
int64_t task_size(task_spec *spec) {
return TASK_SPEC_SIZE(spec->num_args, spec->num_returns,
spec->args_value_size);
}
/* Pointer to the function id stored inside the task specification. */
unique_id *task_function(task_spec *spec) {
return &spec->function_id;
}
/* Number of arguments the task was allocated with. */
int64_t task_num_args(task_spec *spec) {
return spec->num_args;
}
/* Number of return values the task was allocated with. */
int64_t task_num_returns(task_spec *spec) {
return spec->num_returns;
}
/* Type tag (ARG_BY_REF or ARG_BY_VAL) of argument arg_index. The index must
 * lie in [0, num_args). */
int8_t task_arg_type(task_spec *spec, int64_t arg_index) {
  CHECK(0 <= arg_index && arg_index < spec->num_args);
  const task_arg *slot = spec->args_and_returns + arg_index;
  return slot->type;
}
/* Pointer to the object id of argument arg_index. The index must lie in
 * [0, num_args) and the argument must have been passed by reference. */
object_id *task_arg_id(task_spec *spec, int64_t arg_index) {
  CHECK(0 <= arg_index && arg_index < spec->num_args);
  task_arg *arg = &spec->args_and_returns[arg_index];
  /* Terminated with a semicolon like every other CHECK in this file. */
  CHECK(arg->type == ARG_BY_REF);
  return &arg->obj_id;
}
/* Pointer to the inline data of argument arg_index. The index must lie in
 * [0, num_args) and the argument must have been passed by value. */
uint8_t *task_arg_val(task_spec *spec, int64_t arg_index) {
  CHECK(0 <= arg_index && arg_index < spec->num_args);
  task_arg *arg = spec->args_and_returns + arg_index;
  CHECK(arg->type == ARG_BY_VAL);
  /* The value area starts right behind the last argument/return slot. */
  int64_t num_slots = spec->num_args + spec->num_returns;
  uint8_t *value_area = (uint8_t *) (spec->args_and_returns + num_slots);
  return value_area + arg->value.offset;
}
/* Length in bytes of the inline data of argument arg_index. The index must
 * lie in [0, num_args) and the argument must have been passed by value. */
int64_t task_arg_length(task_spec *spec, int64_t arg_index) {
  CHECK(0 <= arg_index && arg_index < spec->num_args);
  task_arg *arg = spec->args_and_returns + arg_index;
  CHECK(arg->type == ARG_BY_VAL);
  int64_t num_bytes = arg->value.length;
  return num_bytes;
}
/* Append a pass-by-reference argument to the task and return its index.
 * Aborts (CHECK) if all num_args argument slots are already filled. */
int64_t task_args_add_ref(task_spec *spec, object_id obj_id) {
  /* Writing slot arg_index when it is >= num_args would overwrite the return
   * slots or run past the allocation. */
  CHECK(0 <= spec->arg_index && spec->arg_index < spec->num_args);
  task_arg *arg = &spec->args_and_returns[spec->arg_index];
  arg->type = ARG_BY_REF;
  arg->obj_id = obj_id;
  return spec->arg_index++;
}
/* Append a pass-by-value argument: copy length bytes from data into the
 * task's inline value area and return the index of the new argument.
 *
 * Aborts (CHECK) if all argument slots are already filled, if length is
 * negative, if the data does not fit into the remaining value area, or if
 * this is the last argument and the value area would not be exactly full. */
int64_t task_args_add_val(task_spec *spec, uint8_t *data, int64_t length) {
  /* Validate everything before the task is modified, so that a failed check
   * never leaves a half-written slot behind. */
  CHECK(0 <= spec->arg_index && spec->arg_index < spec->num_args);
  CHECK(length >= 0);
  CHECK(spec->args_value_offset + length <= spec->args_value_size);
  CHECK(spec->arg_index != spec->num_args - 1 ||
        spec->args_value_offset + length == spec->args_value_size);

  task_arg *arg = &spec->args_and_returns[spec->arg_index];
  arg->type = ARG_BY_VAL;
  arg->value.offset = spec->args_value_offset;
  arg->value.length = length;
  /* task_arg_val resolves the destination from the offset just recorded. */
  uint8_t *addr = task_arg_val(spec, spec->arg_index);
  memcpy(addr, data, length);
  spec->args_value_offset += length;
  return spec->arg_index++;
}
/* Pointer to the object id of return value ret_index, which must lie in
 * [0, num_returns). */
object_id *task_return(task_spec *spec, int64_t ret_index) {
  CHECK(0 <= ret_index && ret_index < spec->num_returns);
  /* Return slots are stored behind the num_args argument slots. */
  task_arg *ret = spec->args_and_returns + spec->num_args + ret_index;
  CHECK(ret->type == ARG_BY_REF); /* No memory corruption. */
  return &ret->obj_id;
}
/* Free a task specification. Aborts (CHECK) unless exactly num_args
 * arguments have been added to it. */
void free_task_spec(task_spec *spec) {
CHECK(spec->arg_index == spec->num_args); /* Task was fully constructed */
free(spec);
}
/* Append a human-readable description of the task to output: the function
 * id, then one item per argument, then one item per return value. Ids are
 * printed in hex. Arguments passed by value have no id, so for them the
 * length of the inline data is printed instead. */
void print_task(task_spec *spec, UT_string *output) {
  /* For converting an id to hex, which has double the number
   * of bytes compared to the id (+ 1 byte for '\0'). */
  static char hex[2 * UNIQUE_ID_SIZE + 1];
  /* Print function id. */
  sha1_to_hex(&task_function(spec)->id[0], &hex[0]);
  utstring_printf(output, "fun %s ", &hex[0]);
  /* Print arguments. */
  for (int i = 0; i < task_num_args(spec); ++i) {
    if (task_arg_type(spec, i) == ARG_BY_REF) {
      sha1_to_hex(&task_arg_id(spec, i)->id[0], &hex[0]);
      utstring_printf(output, " id:%d %s", i, &hex[0]);
    } else {
      /* task_arg_id would abort on a by-value argument. */
      utstring_printf(output, " val:%d len:%lld", i,
                      (long long) task_arg_length(spec, i));
    }
  }
  /* Print return ids. */
  for (int i = 0; i < task_num_returns(spec); ++i) {
    object_id *object_id = task_return(spec, i);
    sha1_to_hex(&object_id->id[0], &hex[0]);
    utstring_printf(output, " ret:%d %s", i, &hex[0]);
  }
}
/* TASK INSTANCES */
/* A task instance: instance id, scheduling state and node, followed by an
 * embedded copy of the task specification. spec is the last member because
 * make_task_instance copies the complete variable-length specification
 * (task_size bytes) to its address, extending past sizeof(task_spec). */
struct task_instance_impl {
task_iid iid;
int32_t state;
node_id node;
task_spec spec;
};
/* Create a task instance that embeds a copy of spec together with the given
 * instance id, state and node. The result is one contiguous allocation and
 * is released with task_instance_free. */
task_instance *make_task_instance(task_iid task_iid,
                                  task_spec *spec,
                                  int32_t state,
                                  node_id node) {
  int64_t spec_bytes = task_size(spec);
  /* The embedded task_spec header is already counted in
   * sizeof(task_instance), hence the subtraction. */
  int64_t total_bytes = sizeof(task_instance) - sizeof(task_spec) + spec_bytes;
  task_instance *instance = malloc(total_bytes);
  memset(instance, 0, total_bytes);
  memcpy(&instance->spec, spec, spec_bytes);
  instance->node = node;
  instance->state = state;
  instance->iid = task_iid;
  return instance;
}
/* Size in bytes of the whole instance: the fixed fields plus the full
 * variable-length task specification embedded in it. */
int64_t task_instance_size(task_instance *instance) {
return sizeof(task_instance) - sizeof(task_spec) + task_size(&instance->spec);
}
/* Pointer to the instance id stored inside the instance. */
task_iid *task_instance_id(task_instance *instance) {
return &instance->iid;
}
/* Pointer to the scheduling state stored inside the instance; writing through
 * it changes the instance's state. */
int32_t *task_instance_state(task_instance *instance) {
return &instance->state;
}
/* Pointer to the node id stored inside the instance. */
node_id *task_instance_node(task_instance *instance) {
return &instance->node;
}
/* Pointer to the task specification embedded in the instance (not a copy). */
task_spec *task_instance_task_spec(task_instance *instance) {
return &instance->spec;
}
/* Release an instance. The embedded specification is part of the same
 * allocation and is freed with it. */
void task_instance_free(task_instance *instance) {
free(instance);
}
+132
View File
@@ -0,0 +1,132 @@
#ifndef TASK_H
#define TASK_H
/* This API specifies the task data structures. It is in C so we can
* easily construct tasks from other languages like Python. The datastructures
* are also defined in such a way that memory is contiguous and all pointers
* are relative, so that we can memcpy the datastructure and ship it over the
* network without serialization and deserialization. */
#include <stddef.h>
#include <stdint.h>
#include "common.h"
#include "utstring.h"
typedef unique_id function_id;
/* The task ID is a deterministic hash of the function ID that
* the task executes and the argument IDs or argument values */
typedef unique_id task_id;
/* The task instance ID is a globally unique ID generated which
* identifies this particular execution of the task */
typedef unique_id task_iid;
/* The node id is an identifier for the node the task is
* scheduled on */
typedef unique_id node_id;
/*
 * TASK SPECIFICATIONS: Contain all the information necessary
* to execute the task (function id, arguments, return object ids).
*
*/
typedef struct task_spec_impl task_spec;
/* If argument is passed by value or reference. */
enum arg_type { ARG_BY_REF, ARG_BY_VAL };
/* Construct and modify task specifications. */
/* Allocating and initializing a task. */
task_spec *alloc_task_spec(function_id function_id,
int64_t num_args,
int64_t num_returns,
int64_t args_value_size);
/* Size of the task in bytes. */
int64_t task_size(task_spec *spec);
/* Return the function ID of the task. */
unique_id *task_function(task_spec *spec);
/* Getting the number of arguments and returns. */
int64_t task_num_args(task_spec *spec);
int64_t task_num_returns(task_spec *spec);
/* Getting task arguments. */
int8_t task_arg_type(task_spec *spec, int64_t arg_index);
unique_id *task_arg_id(task_spec *spec, int64_t arg_index);
uint8_t *task_arg_val(task_spec *spec, int64_t arg_index);
int64_t task_arg_length(task_spec *spec, int64_t arg_index);
/* Setting task arguments. Note that this API only allows you to set the
* arguments in their order of appearance. */
int64_t task_args_add_ref(task_spec *spec, object_id obj_id);
int64_t task_args_add_val(task_spec *spec, uint8_t *data, int64_t length);
/* Getting and setting return arguments. Tasks return by reference for now. */
unique_id *task_return(task_spec *spec, int64_t ret_index);
/* Freeing the task datastructure. */
void free_task_spec(task_spec *spec);
/* Write the task specification to a file or socket. */
void write_task(int fd, task_spec *spec);
/* Read the task specification from a file or socket. It is the user's
* responsibility to free the task after it has been used. */
task_spec *read_task(int fd);
/* Print task as a human-readable string. */
void print_task(task_spec *spec, UT_string *output);
/*
* SCHEDULED TASK: Contains information about a scheduled task:
* the task iid, the task specification and the task status
* (WAITING, SCHEDULED, RUNNING, DONE) and which node the
* task is scheduled on.
*
*/
/* The scheduling_state values are distinct bits, so they can be combined as
 * flags when we are listening for an event, for example
 * TASK_STATUS_WAITING | TASK_STATUS_SCHEDULED. */
enum scheduling_state {
TASK_STATUS_WAITING = 1,
TASK_STATUS_SCHEDULED = 2,
TASK_STATUS_RUNNING = 4,
TASK_STATUS_DONE = 8
};
/* A task instance is one execution of a task specification.
* It has a unique instance id, a state of execution (see scheduling_state)
* and a node it is scheduled on or running on. */
typedef struct task_instance_impl task_instance;
/* Allocate and initialize a new task instance. Must be freed with
 * task_instance_free after use. */
task_instance *make_task_instance(task_iid task_iid,
task_spec *task,
int32_t state,
node_id node);
/* Size of task instance structure in bytes. */
int64_t task_instance_size(task_instance *instance);
/* Instance ID of the task instance. */
task_iid *task_instance_id(task_instance *instance);
/* The scheduling state of the task instance. */
int32_t *task_instance_state(task_instance *instance);
/* Node this task instance has been assigned to or is running on. */
node_id *task_instance_node(task_instance *instance);
/* Task specification of this task instance. */
task_spec *task_instance_task_spec(task_instance *instance);
/* Free this task instance datastructure. */
void task_instance_free(task_instance *instance);
#endif
+24
View File
@@ -0,0 +1,24 @@
#include "greatest.h"
#include "common.h"
SUITE(common_tests);
/* Smoke test: converting a freshly generated unique id to hex must run to
 * completion. The resulting string is not inspected. */
TEST sha1_test(void) {
  /* Two hex digits per id byte plus the terminating '\0'. */
  static char hex_buffer[2 * UNIQUE_ID_SIZE + 1];
  unique_id fresh_id = globally_unique_id();
  sha1_to_hex(&fresh_id.id[0], &hex_buffer[0]);
  PASS();
}
/* Test suite for the common library; currently a single test. */
SUITE(common_tests) {
RUN_TEST(sha1_test);
}
GREATEST_MAIN_DEFS();
/* Test runner entry point: runs the common_tests suite through the greatest
 * framework macros, which use argc and argv. */
int main(int argc, char **argv) {
GREATEST_MAIN_BEGIN();
RUN_SUITE(common_tests);
GREATEST_MAIN_END();
}
+175
View File
@@ -0,0 +1,175 @@
#include "greatest.h"
#include <assert.h>
#include <unistd.h>
#include <sys/wait.h>
#include "event_loop.h"
#include "test/example_task.h"
#include "state/db.h"
#include "state/object_table.h"
#include "state/task_log.h"
#include "state/redis.h"
#include "task.h"
SUITE(db_tests);
/* Address and ports under which the test clients register themselves. */
const char *manager_addr = "127.0.0.1";
int manager_port1 = 12345;
int manager_port2 = 12346;
/* Filled in by test_callback with the address and port strings parsed from
 * the lookup result. The sizes fit the sscanf field widths used there
 * (15 and 5 characters) plus the terminating '\0'. */
char received_addr1[16] = {0};
char received_port1[6] = {0};
char received_addr2[16] = {0};
char received_port2[6] = {0};
/* Test if entries have been written to the database. */
/* Lookup callback for object_table_lookup_test. Expects exactly two manager
 * entries of the form "<address>:<port>", stores the parsed parts in the
 * received_* globals and frees the manager array (but not its strings). */
void test_callback(object_id object_id,
                   int manager_count,
                   const char *manager_vector[],
                   void *context) {
  CHECK(manager_count == 2);

  const char *first = manager_vector[0];
  if (first == NULL || sscanf(first, "%15[0-9.]:%5[0-9]", received_addr1,
                              received_port1) != 2) {
    CHECK(0);
  }

  const char *second = manager_vector[1];
  if (second == NULL || sscanf(second, "%15[0-9.]:%5[0-9]", received_addr2,
                               received_port2) != 2) {
    CHECK(0);
  }

  free(manager_vector);
}
/* Timer callback that stops the event loop. Returns EVENT_LOOP_TIMER_DONE,
 * which presumably tells the loop not to re-arm the timer (see
 * event_loop.h). */
int timeout_handler(event_loop *loop, timer_id timer_id, void *context) {
event_loop_stop(loop);
return EVENT_LOOP_TIMER_DONE;
}
/* Two clients register the same object id; a lookup must then report both of
 * them. Requires a Redis server on 127.0.0.1:6379. */
TEST object_table_lookup_test(void) {
event_loop *loop = event_loop_create();
db_handle *db1 = db_connect("127.0.0.1", 6379, "plasma_manager", manager_addr,
manager_port1);
db_handle *db2 = db_connect("127.0.0.1", 6379, "plasma_manager", manager_addr,
manager_port2);
db_attach(db1, loop);
db_attach(db2, loop);
unique_id id = globally_unique_id();
object_table_add(db1, id);
object_table_add(db2, id);
/* Run the loop until the timer stops it, so the asynchronous adds get a
* chance to complete before the lookup is issued. */
event_loop_add_timer(loop, 100, timeout_handler, NULL);
event_loop_run(loop);
object_table_lookup(db1, id, test_callback, NULL);
/* Same again, to let test_callback fill in the received_* buffers. */
event_loop_add_timer(loop, 100, timeout_handler, NULL);
event_loop_run(loop);
int port1 = atoi(received_port1);
int port2 = atoi(received_port2);
/* Only the first received address is compared; the two ports may come back
* in either order. */
ASSERT_STR_EQ(&received_addr1[0], manager_addr);
ASSERT((port1 == manager_port1 && port2 == manager_port2) ||
(port2 == manager_port1 && port1 == manager_port2));
db_disconnect(db1);
db_disconnect(db2);
event_loop_destroy(loop);
PASS();
}
/* Task log subscriber for task_log_test. userdata is the task_instance that
 * was published; the received instance must be in state
 * TASK_STATUS_SCHEDULED and match it byte for byte. */
void task_log_test_callback(task_instance *instance, void *userdata) {
task_instance *other = userdata;
CHECK(*task_instance_state(instance) == TASK_STATUS_SCHEDULED);
CHECK(task_instance_size(instance) == task_instance_size(other));
CHECK(memcmp(instance, other, task_instance_size(instance)) == 0);
}
/* Subscribe to the task log for one node and state, publish a matching task
 * instance and let task_log_test_callback compare what arrives. Requires a
 * Redis server on 127.0.0.1:6379. */
TEST task_log_test(void) {
event_loop *loop = event_loop_create();
db_handle *db = db_connect("127.0.0.1", 6379, "local_scheduler", "", -1);
db_attach(db, loop);
node_id node = globally_unique_id();
task_spec *task = example_task();
task_instance *instance = make_task_instance(globally_unique_id(), task,
TASK_STATUS_SCHEDULED, node);
/* The published instance doubles as userdata, so the callback can compare
* against it. */
task_log_register_callback(db, task_log_test_callback, node,
TASK_STATUS_SCHEDULED, instance);
task_log_add_task(db, instance);
/* Run the loop until the timer stops it. */
event_loop_add_timer(loop, 100, timeout_handler, NULL);
event_loop_run(loop);
task_instance_free(instance);
free_task_spec(task);
db_disconnect(db);
event_loop_destroy(loop);
PASS();
}
/* Number of times task_log_all_test_callback has been invoked. It is never
 * reset, so it accumulates across the lifetime of the process. */
int num_test_callback_called = 0;
/* Task-log callback for task_log_all_test: only counts invocations; both
 * arguments are ignored. */
void task_log_all_test_callback(task_instance *instance, void *userdata) {
num_test_callback_called += 1;
}
/* Registers a callback with NIL_ID as the node (presumably "all nodes" --
 * confirm in the task log API), adds two task instances with different node
 * ids, and expects the callback to have been invoked exactly twice. */
TEST task_log_all_test(void) {
event_loop *loop = event_loop_create();
db_handle *db = db_connect("127.0.0.1", 6379, "local_scheduler", "", -1);
db_attach(db, loop);
task_spec *task = example_task();
/* Schedule two tasks on different nodes. */
task_instance *instance1 = make_task_instance(
globally_unique_id(), task, TASK_STATUS_SCHEDULED, globally_unique_id());
task_instance *instance2 = make_task_instance(
globally_unique_id(), task, TASK_STATUS_SCHEDULED, globally_unique_id());
task_log_register_callback(db, task_log_all_test_callback, NIL_ID,
TASK_STATUS_SCHEDULED, NULL);
task_log_add_task(db, instance1);
task_log_add_task(db, instance2);
event_loop_add_timer(loop, 100, timeout_handler, NULL);
event_loop_run(loop);
task_instance_free(instance2);
task_instance_free(instance1);
free_task_spec(task);
db_disconnect(db);
event_loop_destroy(loop);
/* Asserted after cleanup so that a failure does not skip the frees above. */
ASSERT(num_test_callback_called == 2);
PASS();
}
/* Opens and closes num_conns database connections in each of two processes
 * and then checks that the next connection gets client id num_conns * 2.
 * NOTE(review): this presumably relies on client ids being handed out
 * sequentially starting from a fresh counter -- confirm against db_connect. */
TEST unique_client_id_test(void) {
const int num_conns = 50;
db_handle *db;
/* fork() happens before the loop, so the parent and the child each run the
 * loop below. */
pid_t pid = fork();
for (int i = 0; i < num_conns; ++i) {
db = db_connect("127.0.0.1", 6379, "plasma_manager", manager_addr,
manager_port1);
db_disconnect(db);
}
if (pid == 0) {
/* Child: done after its connections; do not return into the test runner. */
exit(0);
} else {
/* Parent: wait for the child so all of its connections have been made
 * before the final connection below. A failed fork() (pid == -1) also takes
 * this branch. */
wait(NULL);
}
db = db_connect("127.0.0.1", 6379, "plasma_manager", manager_addr,
manager_port1);
ASSERT_EQ(get_client_id(db), num_conns * 2);
db_disconnect(db);
PASS();
}
/* Test suite: connects to the Redis server at 127.0.0.1:6379, issues FLUSHALL
 * once, and runs each test through RUN_REDIS_TEST (a macro defined outside
 * this excerpt). The result of redisConnect is not checked. */
SUITE(db_tests) {
redisContext *context = redisConnect("127.0.0.1", 6379);
freeReplyObject(redisCommand(context, "FLUSHALL"));
RUN_REDIS_TEST(context, object_table_lookup_test);
RUN_REDIS_TEST(context, task_log_test);
RUN_REDIS_TEST(context, task_log_all_test);
RUN_REDIS_TEST(context, unique_client_id_test);
redisFree(context);
}
/* Entry point of the db test binary: sets up the greatest test framework,
 * runs the db_tests suite, and lets GREATEST_MAIN_END finish the run. */
GREATEST_MAIN_DEFS();
int main(int argc, char **argv) {
GREATEST_MAIN_BEGIN();
RUN_SUITE(db_tests);
GREATEST_MAIN_END();
}
+14
View File
@@ -0,0 +1,14 @@
#ifndef EXAMPLE_TASK_H
#define EXAMPLE_TASK_H
#include "task.h"
/* Build a small task spec for use in tests.
 *
 * The spec gets a fresh function id and is allocated for 2 arguments and
 * 1 return value (the last alloc_task_spec argument is 0; presumably the
 * number of bytes reserved for by-value arguments). Both arguments are filled
 * in as object references with fresh ids.
 *
 * The caller owns the returned spec and releases it with free_task_spec().
 *
 * Declared static inline because the definition lives in a header: without
 * internal linkage, including this header from two translation units of the
 * same binary would produce duplicate-symbol link errors. */
static inline task_spec *example_task(void) {
  function_id func_id = globally_unique_id();
  task_spec *task = alloc_task_spec(func_id, 2, 1, 0);
  task_args_add_ref(task, globally_unique_id());
  task_args_add_ref(task, globally_unique_id());
  return task;
}
#endif
+106
View File
@@ -0,0 +1,106 @@
#include "greatest.h"
#include <assert.h>
#include <unistd.h>
#include <inttypes.h>
#include "io.h"
#include "utstring.h"
SUITE(io_tests);
/* Round-trips a log message and a raw typed message over a Unix domain socket.
 * A forked child connects and writes; the parent accepts, reads and verifies.
 * The payload length is asserted before the byte comparison, so an empty or
 * oversized message cannot pass (or over-read the literal) in memcmp. */
TEST ipc_socket_test(void) {
  const char *socket_pathname = "test-socket";
  int socket_fd = bind_ipc_sock(socket_pathname);
  ASSERT(socket_fd >= 0);
  char *test_string = "hello world";
  char *test_bytes = "another string";
  pid_t pid = fork();
  /* Without a child nobody would ever connect and accept_client would block. */
  ASSERT(pid >= 0);
  if (pid == 0) {
    /* Child: drop the inherited listening socket and act as the client. */
    close(socket_fd);
    socket_fd = connect_ipc_sock(socket_pathname);
    ASSERT(socket_fd >= 0);
    write_log_message(socket_fd, test_string);
    write_message(socket_fd, LOG_MESSAGE, strlen(test_bytes),
                  (uint8_t *) test_bytes);
    close(socket_fd);
    exit(0);
  } else {
    /* Parent: accept the child's connection and read both messages back. */
    int client_fd = accept_client(socket_fd);
    ASSERT(client_fd >= 0);
    char *message = read_log_message(client_fd);
    ASSERT(message != NULL);
    ASSERT_STR_EQ(test_string, message);
    free(message);
    int64_t type;
    int64_t len;
    uint8_t *bytes;
    read_message(client_fd, &type, &len, &bytes);
    ASSERT(type == LOG_MESSAGE);
    /* The child sent exactly strlen(test_bytes) bytes (no terminator). */
    ASSERT(len == (int64_t) strlen(test_bytes));
    ASSERT(memcmp(test_bytes, bytes, len) == 0);
    free(bytes);
    close(client_fd);
    close(socket_fd);
    unlink(socket_pathname);
  }
  PASS();
}
/* Same round trip as ipc_socket_test, but the log message is a long string
 * ("hello world " repeated 10000 times) built with utstring. The raw
 * message's length is asserted before the byte comparison, so an empty or
 * oversized message cannot pass (or over-read the literal) in memcmp. */
TEST long_ipc_socket_test(void) {
  const char *socket_pathname = "long-test-socket";
  int socket_fd = bind_ipc_sock(socket_pathname);
  ASSERT(socket_fd >= 0);
  UT_string *test_string;
  utstring_new(test_string);
  for (int i = 0; i < 10000; i++) {
    utstring_printf(test_string, "hello world ");
  }
  char *test_bytes = "another string";
  pid_t pid = fork();
  /* Without a child nobody would ever connect and accept_client would block. */
  ASSERT(pid >= 0);
  if (pid == 0) {
    /* Child: drop the inherited listening socket and act as the client. */
    close(socket_fd);
    socket_fd = connect_ipc_sock(socket_pathname);
    ASSERT(socket_fd >= 0);
    write_log_message(socket_fd, utstring_body(test_string));
    write_message(socket_fd, LOG_MESSAGE, strlen(test_bytes),
                  (uint8_t *) test_bytes);
    close(socket_fd);
    exit(0);
  } else {
    /* Parent: accept the child's connection and read both messages back. */
    int client_fd = accept_client(socket_fd);
    ASSERT(client_fd >= 0);
    char *message = read_log_message(client_fd);
    ASSERT(message != NULL);
    ASSERT_STR_EQ(utstring_body(test_string), message);
    free(message);
    int64_t type;
    int64_t len;
    uint8_t *bytes;
    read_message(client_fd, &type, &len, &bytes);
    ASSERT(type == LOG_MESSAGE);
    /* The child sent exactly strlen(test_bytes) bytes (no terminator). */
    ASSERT(len == (int64_t) strlen(test_bytes));
    ASSERT(memcmp(test_bytes, bytes, len) == 0);
    free(bytes);
    close(client_fd);
    close(socket_fd);
    unlink(socket_pathname);
  }
  utstring_free(test_string);
  PASS();
}
/* Test suite for the IPC socket helpers: runs the short and the long
 * message round-trip tests. */
SUITE(io_tests) {
RUN_TEST(ipc_socket_test);
RUN_TEST(long_ipc_socket_test);
}
/* Entry point of the io test binary: sets up the greatest test framework,
 * runs the io_tests suite, and lets GREATEST_MAIN_END finish the run. */
GREATEST_MAIN_DEFS();
int main(int argc, char **argv) {
GREATEST_MAIN_BEGIN();
RUN_SUITE(io_tests);
GREATEST_MAIN_END();
}
+225
View File
@@ -0,0 +1,225 @@
#include "greatest.h"
#include <assert.h>
#include <unistd.h>
#include "utarray.h"
#include "event_loop.h"
#include "state/db.h"
#include "state/redis.h"
#include "io.h"
#include "logging.h"
SUITE(redis_tests);
/* Format strings and the key/value pair used to build the SET and GET
 * commands issued by the tests below. */
const char *test_set_format = "SET %s %s";
const char *test_get_format = "GET %s";
const char *test_key = "foo";
const char *test_value = "bar";
/* File descriptors opened by the event-loop based tests; each such test
 * creates the array, pushes its descriptors, and closes them all at the end. */
UT_array *connections = NULL;
/* Set to 1 once async_redis_socket_test_callback has run. */
int async_redis_socket_test_callback_called = 0;
/* Reply callback for the asynchronous command sent in async_redis_socket_test.
 * It opens a separate synchronous connection, reads test_key back with GET and
 * checks that the stored value equals test_value. The callback's own
 * arguments are not used. */
void async_redis_socket_test_callback(redisAsyncContext *ac,
void *r,
void *privdata) {
async_redis_socket_test_callback_called = 1;
redisContext *context = redisConnect("127.0.0.1", 6379);
redisReply *reply = redisCommand(context, test_get_format, test_key);
redisFree(context);
CHECK(reply != NULL);
/* NOTE(review): reply->str is passed to strcmp without a NULL check. */
if (strcmp(reply->str, test_value)) {
/* Free the reply before failing so the mismatch path does not leak it. */
freeReplyObject(reply);
CHECK(0);
}
freeReplyObject(reply);
}
/* Sends a formatted SET command through a local IPC socket, reads it back on
 * the accepting side, executes the received string against Redis over a
 * synchronous connection, and verifies the value with GET. */
TEST redis_socket_test(void) {
const char *socket_pathname = "redis-test-socket";
redisContext *context = redisConnect("127.0.0.1", 6379);
ASSERT(context != NULL);
int socket_fd = bind_ipc_sock(socket_pathname);
ASSERT(socket_fd >= 0);
int client_fd = connect_ipc_sock(socket_pathname);
ASSERT(client_fd >= 0);
write_formatted_log_message(client_fd, test_set_format, test_key, test_value);
/* NOTE(review): the result of accept_client is not checked here. */
int server_fd = accept_client(socket_fd);
char *cmd = read_log_message(server_fd);
close(client_fd);
close(server_fd);
close(socket_fd);
unlink(socket_pathname);
redisReply *reply;
/* NOTE(review): cmd (data read from the socket) is used as the format string;
 * the two trailing zeros would only be consumed if cmd contained format
 * specifiers -- confirm this is intended. */
reply = redisCommand(context, cmd, 0, 0);
freeReplyObject(reply);
reply = redisCommand(context, "GET %s", test_key);
ASSERT(reply != NULL);
ASSERT_STR_EQ(reply->str, test_value);
freeReplyObject(reply);
free(cmd);
redisFree(context);
PASS();
}
/* File-event callback for async_redis_socket_test.
 *
 * Reads one log message from fd and submits it to Redis as an asynchronous
 * command on the connection held by the db_handle passed as context, with
 * async_redis_socket_test_callback as the reply callback. The message is used
 * as the command format string; the handle's client id and a 0 are passed as
 * format arguments. The loop and events arguments are not used. */
void redis_read_callback(event_loop *loop, int fd, void *context, int events) {
  db_handle *handle = context;
  char *command = read_log_message(fd);

  redisAsyncCommand(handle->context, async_redis_socket_test_callback, NULL,
                    command, handle->client_id, 0);
  /* The message buffer is released as soon as the command is queued. */
  free(command);
}
/* File-event callback for the listening socket in async_redis_socket_test:
 * accepts a client, records the new descriptor in the global connections
 * array so the test can close it later, and registers redis_read_callback for
 * it with the same context. */
void redis_accept_callback(event_loop *loop,
int socket_fd,
void *context,
int events) {
int accept_fd = accept_client(socket_fd);
CHECK(accept_fd >= 0);
utarray_push_back(connections, &accept_fd);
event_loop_add_file(loop, accept_fd, EVENT_LOOP_READ, redis_read_callback,
context);
}
/* Timer callback used by the tests in this file together with
 * event_loop_add_timer(): it calls event_loop_stop() on the loop and returns
 * EVENT_LOOP_TIMER_DONE (presumably "do not reschedule" -- confirm in
 * event_loop.h). The timer id and context arguments are not used. */
int timeout_handler(event_loop *loop, timer_id timer_id, void *context) {
event_loop_stop(loop);
return EVENT_LOOP_TIMER_DONE;
}
/* Event-loop variant of redis_socket_test: a SET command is written to an IPC
 * socket, the accept/read callbacks forward it to Redis asynchronously, and
 * the reply callback verifies the stored value. The test only checks that the
 * reply callback ran. */
TEST async_redis_socket_test(void) {
utarray_new(connections, &ut_int_icd);
event_loop *loop = event_loop_create();
/* Start IPC channel. */
const char *socket_pathname = "async-redis-test-socket";
int socket_fd = bind_ipc_sock(socket_pathname);
ASSERT(socket_fd >= 0);
utarray_push_back(connections, &socket_fd);
/* Start connection to Redis. */
db_handle *db = db_connect("127.0.0.1", 6379, "", "", 0);
db_attach(db, loop);
/* Send a command to the Redis process. */
int client_fd = connect_ipc_sock(socket_pathname);
ASSERT(client_fd >= 0);
utarray_push_back(connections, &client_fd);
write_formatted_log_message(client_fd, test_set_format, test_key, test_value);
/* Register read handling for the client end and accept handling for the
 * listening socket, then run the loop until timeout_handler stops it. */
event_loop_add_file(loop, client_fd, EVENT_LOOP_READ, redis_read_callback,
db);
event_loop_add_file(loop, socket_fd, EVENT_LOOP_READ, redis_accept_callback,
db);
event_loop_add_timer(loop, 100, timeout_handler, NULL);
event_loop_run(loop);
CHECK(async_redis_socket_test_callback_called);
db_disconnect(db);
event_loop_destroy(loop);
/* Close every descriptor recorded during the test, including accepted ones. */
for (int *p = (int *) utarray_front(connections); p != NULL;
p = (int *) utarray_next(connections, p)) {
close(*p);
}
unlink(socket_pathname);
utarray_free(connections);
PASS();
}
/* Set to 1 once logging_test_callback has run. */
int logging_test_callback_called = 0;
/* Reply callback for the asynchronous command sent in logging_test. It opens
 * a separate synchronous connection and checks that at least one key matching
 * "log:*" exists. The callback's own arguments are not used. */
void logging_test_callback(redisAsyncContext *ac, void *r, void *privdata) {
logging_test_callback_called = 1;
redisContext *context = redisConnect("127.0.0.1", 6379);
redisReply *reply = redisCommand(context, "KEYS %s", "log:*");
redisFree(context);
CHECK(reply != NULL);
CHECK(reply->elements > 0);
freeReplyObject(reply);
}
/* File-event callback for logging_test.
 *
 * Reads one log message from fd and submits it to Redis as an asynchronous
 * command on the connection held by the db_handle passed as context, with
 * logging_test_callback as the reply callback. The message is used as the
 * command format string; the handle's client id and a 0 are passed as format
 * arguments. The loop and events arguments are not used. */
void logging_read_callback(event_loop *loop,
                           int fd,
                           void *context,
                           int events) {
  db_handle *handle = context;
  char *command = read_log_message(fd);

  redisAsyncCommand(handle->context, logging_test_callback, NULL, command,
                    handle->client_id, 0);
  /* The message buffer is released as soon as the command is queued. */
  free(command);
}
/* File-event callback for the listening socket in logging_test: accepts a
 * client, records the new descriptor in the global connections array so the
 * test can close it later, and registers logging_read_callback for it with
 * the same context. */
void logging_accept_callback(event_loop *loop,
int socket_fd,
void *context,
int events) {
int accept_fd = accept_client(socket_fd);
CHECK(accept_fd >= 0);
utarray_push_back(connections, &accept_fd);
event_loop_add_file(loop, accept_fd, EVENT_LOOP_READ, logging_read_callback,
context);
}
/* Logs one message through a ray_logger that is given the client end of an
 * IPC socket, lets the accept/read callbacks forward what arrives on the
 * socket to Redis, and checks that the reply callback ran (the callback
 * itself checks that a "log:*" key exists). */
TEST logging_test(void) {
utarray_new(connections, &ut_int_icd);
event_loop *loop = event_loop_create();
/* Start IPC channel. */
const char *socket_pathname = "logging-test-socket";
int socket_fd = bind_ipc_sock(socket_pathname);
ASSERT(socket_fd >= 0);
utarray_push_back(connections, &socket_fd);
/* Start connection to Redis. */
db_handle *conn = db_connect("127.0.0.1", 6379, "", "", 0);
db_attach(conn, loop);
/* Send a command to the Redis process. */
int client_fd = connect_ipc_sock(socket_pathname);
ASSERT(client_fd >= 0);
utarray_push_back(connections, &client_fd);
/* The logger receives a pointer to client_fd; the meaning of the third
 * argument (0) is defined in logging.h, outside this excerpt. */
ray_logger *logger = init_ray_logger("worker", RAY_INFO, 0, &client_fd);
ray_log(logger, RAY_INFO, "TEST", "Message");
event_loop_add_file(loop, socket_fd, EVENT_LOOP_READ, logging_accept_callback,
conn);
event_loop_add_file(loop, client_fd, EVENT_LOOP_READ, logging_read_callback,
conn);
event_loop_add_timer(loop, 100, timeout_handler, NULL);
event_loop_run(loop);
CHECK(logging_test_callback_called);
free_ray_logger(logger);
db_disconnect(conn);
event_loop_destroy(loop);
/* Close every descriptor recorded during the test, including accepted ones. */
for (int *p = (int *) utarray_front(connections); p != NULL;
p = (int *) utarray_next(connections, p)) {
close(*p);
}
unlink(socket_pathname);
utarray_free(connections);
PASS();
}
/* Test suite: connects to the Redis server at 127.0.0.1:6379, issues FLUSHALL
 * once, and runs each test through RUN_REDIS_TEST (a macro defined outside
 * this excerpt). The result of redisConnect is not checked. */
SUITE(redis_tests) {
redisContext *context = redisConnect("127.0.0.1", 6379);
freeReplyObject(redisCommand(context, "FLUSHALL"));
RUN_REDIS_TEST(context, redis_socket_test);
RUN_REDIS_TEST(context, async_redis_socket_test);
RUN_REDIS_TEST(context, logging_test);
redisFree(context);
}
/* Entry point of the redis test binary: sets up the greatest test framework,
 * runs the redis_tests suite, and lets GREATEST_MAIN_END finish the run. */
GREATEST_MAIN_DEFS();
int main(int argc, char **argv) {
GREATEST_MAIN_BEGIN();
RUN_SUITE(redis_tests);
GREATEST_MAIN_END();
}
+77
View File
@@ -0,0 +1,77 @@
#include "greatest.h"
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include "common.h"
#include "test/example_task.h"
#include "task.h"
#include "io.h"
SUITE(task_tests);
/* Builds a task spec with 4 arguments and 2 return values, fills the
 * arguments alternately by reference and by value, sets both return ids, and
 * checks that every accessor reads back what was stored. */
TEST task_test(void) {
function_id func_id = globally_unique_id();
/* 4 arguments, 2 returns; the final 10 matches the total size of the two
 * 5-byte by-value arguments added below. */
task_spec *task = alloc_task_spec(func_id, 4, 2, 10);
ASSERT(task_num_args(task) == 4);
ASSERT(task_num_returns(task) == 2);
/* Each add call is expected to return the index of the argument it filled. */
unique_id arg1 = globally_unique_id();
ASSERT(task_args_add_ref(task, arg1) == 0);
ASSERT(task_args_add_val(task, (uint8_t *) "hello", 5) == 1);
unique_id arg2 = globally_unique_id();
ASSERT(task_args_add_ref(task, arg2) == 2);
ASSERT(task_args_add_val(task, (uint8_t *) "world", 5) == 3);
unique_id ret0 = globally_unique_id();
unique_id ret1 = globally_unique_id();
memcpy(task_return(task, 0), &ret0, sizeof(ret0));
memcpy(task_return(task, 1), &ret1, sizeof(ret1));
ASSERT(memcmp(task_arg_id(task, 0), &arg1, sizeof(arg1)) == 0);
ASSERT(memcmp(task_arg_val(task, 1), (uint8_t *) "hello",
task_arg_length(task, 1)) == 0);
ASSERT(memcmp(task_arg_id(task, 2), &arg2, sizeof(arg2)) == 0);
ASSERT(memcmp(task_arg_val(task, 3), (uint8_t *) "world",
task_arg_length(task, 3)) == 0);
ASSERT(memcmp(task_return(task, 0), &ret0, sizeof(unique_id)) == 0);
ASSERT(memcmp(task_return(task, 1), &ret1, sizeof(unique_id)) == 0);
free_task_spec(task);
PASS();
}
/* Sends a task spec through one end of a socket pair as a SUBMIT_TASK message
 * and checks that the bytes read from the other end are identical.
 *
 * Both socket descriptors are closed as soon as the message has been read, so
 * a failing assertion below cannot leak them. The spec is released with
 * free_task_spec(), matching how task_test releases specs; the received copy
 * is a plain message buffer and is released with free(). */
TEST send_task(void) {
  function_id func_id = globally_unique_id();
  task_spec *task = alloc_task_spec(func_id, 4, 2, 10);
  *task_return(task, 1) = globally_unique_id();
  int fd[2];
  /* Without a connected pair the write/read below would use garbage fds. */
  ASSERT(socketpair(AF_UNIX, SOCK_STREAM, 0, fd) == 0);
  write_message(fd[0], SUBMIT_TASK, task_size(task), (uint8_t *) task);
  int64_t type;
  int64_t length;
  uint8_t *message;
  read_message(fd[1], &type, &length, &message);
  close(fd[0]);
  close(fd[1]);
  task_spec *result = (task_spec *) message;
  ASSERT(type == SUBMIT_TASK);
  /* Verify the size before comparing that many bytes of the received buffer. */
  ASSERT(length == task_size(task));
  ASSERT(memcmp(task, result, task_size(task)) == 0);
  ASSERT(memcmp(task, result, task_size(result)) == 0);
  free_task_spec(task);
  free(result);
  PASS();
}
/* Test suite for task specs: in-memory accessors, then a round trip through
 * a socket pair. */
SUITE(task_tests) {
RUN_TEST(task_test);
RUN_TEST(send_task);
}
/* Entry point of the task test binary: sets up the greatest test framework,
 * runs the task_tests suite, and lets GREATEST_MAIN_END finish the run. */
GREATEST_MAIN_DEFS();
int main(int argc, char **argv) {
GREATEST_MAIN_BEGIN();
RUN_SUITE(task_tests);
GREATEST_MAIN_END();
}
+119
View File
@@ -0,0 +1,119 @@
from __future__ import print_function
import pickle
import unittest
import common
# Values for which common.check_simple_value is expected to return True
# (see TestSerialization). The `0L` long literals make this Python 2 syntax.
# The 990-character strings are presumably just under a size limit inside
# check_simple_value (the complex list uses 999) -- confirm in the extension.
BASE_SIMPLE_OBJECTS = [
0, 1, 100000, 0L, 1L, 100000L, 1L << 100, 0.0, 0.5, 0.9, 100000.1, (), [], {},
"", 990 * "h", u"", 990 * u"h"
]
# The same values wrapped one level deep in a list, a tuple and a dict.
LIST_SIMPLE_OBJECTS = [[obj] for obj in BASE_SIMPLE_OBJECTS]
TUPLE_SIMPLE_OBJECTS = [(obj,) for obj in BASE_SIMPLE_OBJECTS]
DICT_SIMPLE_OBJECTS = [{(): obj} for obj in BASE_SIMPLE_OBJECTS]
# Everything above, concatenated.
SIMPLE_OBJECTS = (BASE_SIMPLE_OBJECTS +
LIST_SIMPLE_OBJECTS +
TUPLE_SIMPLE_OBJECTS +
DICT_SIMPLE_OBJECTS)
# Create some complex objects that cannot be serialized by value in tasks.
# `l` is a list that contains itself, i.e. a cyclic structure.
l = []
l.append(l)
# Empty user-defined class; an instance of it is used below as an example of
# an object for which check_simple_value is expected to return False.
class Foo(object):
def __init__(self):
pass
# Values for which common.check_simple_value is expected to return False
# (see TestSerialization): 999-character strings, the self-containing list,
# a custom class instance and a three-level nested list.
BASE_COMPLEX_OBJECTS = [999 * "h", 999 * u"h", l, Foo(), 10 * [10 * [10 * [1]]]]
# The same values wrapped one level deep in a list, a tuple and a dict.
LIST_COMPLEX_OBJECTS = [[obj] for obj in BASE_COMPLEX_OBJECTS]
TUPLE_COMPLEX_OBJECTS = [(obj,) for obj in BASE_COMPLEX_OBJECTS]
DICT_COMPLEX_OBJECTS = [{(): obj} for obj in BASE_COMPLEX_OBJECTS]
# Everything above, concatenated.
COMPLEX_OBJECTS = (BASE_COMPLEX_OBJECTS +
LIST_COMPLEX_OBJECTS +
TUPLE_COMPLEX_OBJECTS +
DICT_COMPLEX_OBJECTS)
class TestSerialization(unittest.TestCase):
  """Checks how common.check_simple_value classifies the sample objects."""

  def test_serialize_by_value(self):
    # Every simple object must be accepted ...
    for simple in SIMPLE_OBJECTS:
      self.assertTrue(common.check_simple_value(simple))
    # ... and every complex object must be rejected.
    for complicated in COMPLEX_OBJECTS:
      self.assertFalse(common.check_simple_value(complicated))
class TestObjectID(unittest.TestCase):
  """Tests for constructing common.ObjectID and for refusing to pickle it."""

  def test_create_object_id(self):
    # Constructing an ID from a 20-character string must not raise.
    common.ObjectID(20 * "a")

  def test_cannot_pickle_object_ids(self):
    object_ids = [common.ObjectID(20 * chr(i)) for i in range(256)]

    # Three ways a function can carry object IDs along: returning them,
    # holding them in a default argument, and referencing them in its body.
    def f():
      return object_ids

    def g(val=object_ids):
      return 1

    def h():
      x = object_ids[0]
      return 1

    # Make sure that object IDs cannot be pickled (including functions that
    # close over object IDs). The calls go through the imported `pickle`
    # module: the previous `pickling` name was undefined in this file, so each
    # lambda raised NameError and the assertions passed without ever
    # attempting to pickle anything.
    self.assertRaises(Exception, lambda: pickle.dumps(object_ids[0]))
    self.assertRaises(Exception, lambda: pickle.dumps(object_ids))
    self.assertRaises(Exception, lambda: pickle.dumps(f))
    self.assertRaises(Exception, lambda: pickle.dumps(g))
    self.assertRaises(Exception, lambda: pickle.dumps(h))
# Builds common.Task objects from many argument lists and return counts and
# checks that the function id, the arguments and the number of return values
# read back from the task match what was passed in.
class TestTask(unittest.TestCase):
def test_create_task(self):
# TODO(rkn): The function ID should be a FunctionID object, not an ObjectID.
function_id = common.ObjectID(20 * "a")
object_ids = [common.ObjectID(20 * chr(i)) for i in range(256)]
# Argument lists to try: plain values of growing length, object IDs only,
# and mixtures of both. The `2L` / `1L << 100` literals are Python 2 syntax.
args_list = [
[],
1 * [1],
10 * [1],
100 * [1],
1000 * [1],
1 * ["a"],
10 * ["a"],
100 * ["a"],
1000 * ["a"],
[1, 1.3, 2L, 1L << 100, "hi", u"hi", [1, 2]],
object_ids[:1],
object_ids[:2],
object_ids[:3],
object_ids[:4],
object_ids[:5],
object_ids[:10],
object_ids[:100],
object_ids[:256],
[1, object_ids[0]],
[object_ids[0], "a"],
[1, object_ids[0], "a"],
[object_ids[0], 1, object_ids[1], "a"],
object_ids[:3] + [1, "hi", 2.3] + object_ids[:5],
object_ids + 100 * ["a"] + object_ids
]
for args in args_list:
for num_return_vals in [0, 1, 2, 3, 5, 10, 100]:
task = common.Task(function_id, args, num_return_vals)
self.assertEqual(function_id.id(), task.function_id().id())
retrieved_args = task.arguments()
self.assertEqual(num_return_vals, len(task.returns()))
self.assertEqual(len(args), len(retrieved_args))
for i in range(len(retrieved_args)):
# Object IDs are compared through id(); other values are compared directly.
if isinstance(retrieved_args[i], common.ObjectID):
self.assertEqual(retrieved_args[i].id(), args[i].id())
else:
self.assertEqual(retrieved_args[i], args[i])
# Run all test cases with verbose output when executed as a script.
if __name__ == "__main__":
unittest.main(verbosity=2)
+465
View File
@@ -0,0 +1,465 @@
/* A simple event-driven programming library. Originally I wrote this code
* for the Jim's event-loop (Jim is a Tcl interpreter) but later translated
* it in form of a library for easy reuse.
*
* Copyright (c) 2006-2010, Salvatore Sanfilippo <antirez at gmail dot com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Redis nor the names of its contributors may be used
* to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdio.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
#include <stdlib.h>
#include <poll.h>
#include <string.h>
#include <time.h>
#include <errno.h>
#include "ae.h"
#include "zmalloc.h"
#include "config.h"
/* Include the best multiplexing layer supported by this system.
* The following should be ordered by performances, descending. */
#ifdef HAVE_EVPORT
#include "ae_evport.c"
#else
#ifdef HAVE_EPOLL
#include "ae_epoll.c"
#else
#ifdef HAVE_KQUEUE
#include "ae_kqueue.c"
#else
#include "ae_select.c"
#endif
#endif
#endif
/* Allocate and initialize an event loop able to track `setsize` file
 * descriptors (indices 0 .. setsize-1 of the events array).
 *
 * Returns the new loop, or NULL if an allocation or the creation of the
 * polling backend (aeApiCreate) fails; on failure everything allocated so far
 * is released. */
aeEventLoop *aeCreateEventLoop(int setsize) {
aeEventLoop *eventLoop;
int i;
if ((eventLoop = zmalloc(sizeof(*eventLoop))) == NULL) goto err;
/* Both arrays are assigned before either is checked, so the err path below
 * always sees initialized pointers. */
eventLoop->events = zmalloc(sizeof(aeFileEvent)*setsize);
eventLoop->fired = zmalloc(sizeof(aeFiredEvent)*setsize);
if (eventLoop->events == NULL || eventLoop->fired == NULL) goto err;
eventLoop->setsize = setsize;
eventLoop->lastTime = time(NULL);
eventLoop->timeEventHead = NULL;
eventLoop->timeEventNextId = 0;
eventLoop->stop = 0;
/* -1 means "no file descriptor registered". */
eventLoop->maxfd = -1;
eventLoop->beforesleep = NULL;
if (aeApiCreate(eventLoop) == -1) goto err;
/* Events with mask == AE_NONE are not set. So let's initialize the
 * vector with it. */
for (i = 0; i < setsize; i++)
eventLoop->events[i].mask = AE_NONE;
return eventLoop;
err:
if (eventLoop) {
zfree(eventLoop->events);
zfree(eventLoop->fired);
zfree(eventLoop);
}
return NULL;
}
/* Return the current set size, i.e. the number of file descriptor slots the
 * loop was created with or last resized to. */
int aeGetSetSize(aeEventLoop *eventLoop) {
return eventLoop->setsize;
}
/* Resize the maximum set size of the event loop.
 * If a file descriptor that is >= the requested set size is currently
 * registered (it would no longer fit in the arrays), or the polling backend
 * fails to resize, AE_ERR is returned and the operation is not performed at
 * all.
 *
 * Otherwise AE_OK is returned and the operation is successful. Requesting the
 * current size is a no-op that returns AE_OK. */
int aeResizeSetSize(aeEventLoop *eventLoop, int setsize) {
int i;
if (setsize == eventLoop->setsize) return AE_OK;
if (eventLoop->maxfd >= setsize) return AE_ERR;
if (aeApiResize(eventLoop,setsize) == -1) return AE_ERR;
/* The results of zrealloc are stored without a NULL check. */
eventLoop->events = zrealloc(eventLoop->events,sizeof(aeFileEvent)*setsize);
eventLoop->fired = zrealloc(eventLoop->fired,sizeof(aeFiredEvent)*setsize);
eventLoop->setsize = setsize;
/* Make sure that if we created new slots, they are initialized with
 * an AE_NONE mask. */
for (i = eventLoop->maxfd+1; i < setsize; i++)
eventLoop->events[i].mask = AE_NONE;
return AE_OK;
}
/* Release the polling backend state, both event arrays and the loop itself.
 * Time events still linked from timeEventHead are not freed here. */
void aeDeleteEventLoop(aeEventLoop *eventLoop) {
aeApiFree(eventLoop);
zfree(eventLoop->events);
zfree(eventLoop->fired);
zfree(eventLoop);
}
/* Set the stop flag; aeMain() checks it before each iteration and returns
 * once it is set. */
void aeStop(aeEventLoop *eventLoop) {
eventLoop->stop = 1;
}
/* Register `proc` to be called when `fd` becomes readable and/or writable,
 * as selected by `mask` (AE_READABLE | AE_WRITABLE).
 *
 * Returns AE_OK on success. Returns AE_ERR with errno set to ERANGE when fd
 * does not fit in the loop's set size, and AE_ERR when the polling backend
 * rejects the registration. clientData replaces any value stored by an
 * earlier registration for the same fd. */
int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask,
        aeFileProc *proc, void *clientData)
{
    aeFileEvent *slot;

    if (fd >= eventLoop->setsize) {
        errno = ERANGE;
        return AE_ERR;
    }
    slot = &eventLoop->events[fd];

    if (aeApiAddEvent(eventLoop, fd, mask) == -1) {
        return AE_ERR;
    }

    /* Merge with whatever was already registered for this descriptor. */
    slot->mask |= mask;
    if (mask & AE_READABLE) {
        slot->rfileProc = proc;
    }
    if (mask & AE_WRITABLE) {
        slot->wfileProc = proc;
    }
    slot->clientData = clientData;

    /* Track the highest registered descriptor. */
    if (eventLoop->maxfd < fd) {
        eventLoop->maxfd = fd;
    }
    return AE_OK;
}
/* Stop watching `fd` for the events in `mask`.
 *
 * Does nothing when fd is outside the set size or has nothing registered.
 * When the descriptor that was the highest registered one ends up with no
 * events left, maxfd is lowered to the next registered descriptor (or -1 if
 * there is none). */
void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask)
{
    aeFileEvent *slot;

    if (fd >= eventLoop->setsize) return;

    slot = &eventLoop->events[fd];
    if (slot->mask == AE_NONE) return;

    aeApiDelEvent(eventLoop, fd, mask);
    slot->mask &= ~mask;

    if (fd == eventLoop->maxfd && slot->mask == AE_NONE) {
        /* Walk downwards to the next descriptor that still has events. */
        int candidate = eventLoop->maxfd - 1;

        while (candidate >= 0 &&
               eventLoop->events[candidate].mask == AE_NONE) {
            candidate--;
        }
        eventLoop->maxfd = candidate;
    }
}
/* Return the event mask currently registered for `fd`, or 0 when fd is
 * outside the loop's set size. */
int aeGetFileEvents(aeEventLoop *eventLoop, int fd) {
    if (fd < eventLoop->setsize) {
        return eventLoop->events[fd].mask;
    }
    return 0;
}
/* Store the current wall-clock time, as reported by gettimeofday(), into
 * *seconds (whole seconds) and *milliseconds (the sub-second part, 0..999). */
static void aeGetTime(long *seconds, long *milliseconds)
{
    struct timeval now;

    gettimeofday(&now, NULL);
    *milliseconds = now.tv_usec / 1000;
    *seconds = now.tv_sec;
}
/* Compute the absolute time that lies `milliseconds` after now and store it
 * as whole seconds in *sec and the remaining milliseconds in *ms. */
static void aeAddMillisecondsToNow(long long milliseconds, long *sec, long *ms) {
    long now_sec, now_ms;
    long target_sec, target_ms;

    aeGetTime(&now_sec, &now_ms);
    target_sec = now_sec + milliseconds / 1000;
    target_ms = now_ms + milliseconds % 1000;

    /* Carry an overflowing millisecond part into the seconds. */
    if (target_ms >= 1000) {
        target_sec += 1;
        target_ms -= 1000;
    }

    *sec = target_sec;
    *ms = target_ms;
}
/* Schedule `proc` to run `milliseconds` from now.
 *
 * clientData is passed to proc and to finalizerProc; finalizerProc (may be
 * NULL) is invoked when the event is removed from the list. Returns the id of
 * the new time event, or AE_ERR if the event could not be allocated. */
long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds,
        aeTimeProc *proc, void *clientData,
        aeEventFinalizerProc *finalizerProc)
{
    /* The id is consumed even when the allocation below fails. */
    long long new_id = eventLoop->timeEventNextId++;
    aeTimeEvent *event = zmalloc(sizeof(*event));

    if (event == NULL) return AE_ERR;

    event->id = new_id;
    aeAddMillisecondsToNow(milliseconds, &event->when_sec, &event->when_ms);
    event->timeProc = proc;
    event->finalizerProc = finalizerProc;
    event->clientData = clientData;

    /* New events are pushed onto the front of the singly linked list. */
    event->next = eventLoop->timeEventHead;
    eventLoop->timeEventHead = event;

    return new_id;
}
/* Mark the time event with the given id for deletion.
 *
 * The event is only flagged with AE_DELETED_EVENT_ID here; it is unlinked and
 * freed later by processTimeEvents(). Returns AE_OK if an event with that id
 * was found, AE_ERR otherwise. */
int aeDeleteTimeEvent(aeEventLoop *eventLoop, long long id)
{
    aeTimeEvent *cursor;

    for (cursor = eventLoop->timeEventHead; cursor != NULL;
         cursor = cursor->next) {
        if (cursor->id != id) continue;
        cursor->id = AE_DELETED_EVENT_ID;
        return AE_OK;
    }
    return AE_ERR; /* NO event with the specified ID found */
}
/* Find the time event that is due first.
 * The result tells the caller how long the polling call may sleep without
 * delaying any time event. Returns NULL when there are no timers.
 *
 * The search is O(N) because time events are kept unsorted.
 * Possible optimizations (not needed by Redis so far, but...):
 * 1) Insert the event in order, so that the nearest is just the head.
 *    Much better but still insertion or deletion of timers is O(N).
 * 2) Use a skiplist to have this operation as O(1) and insertion as O(log(N)).
 */
static aeTimeEvent *aeSearchNearestTimer(aeEventLoop *eventLoop)
{
    aeTimeEvent *best = NULL;
    aeTimeEvent *cursor;

    for (cursor = eventLoop->timeEventHead; cursor != NULL;
         cursor = cursor->next) {
        /* Earlier means: smaller seconds, or equal seconds and smaller ms. */
        int earlier = best == NULL ||
                      cursor->when_sec < best->when_sec ||
                      (cursor->when_sec == best->when_sec &&
                       cursor->when_ms < best->when_ms);

        if (earlier) best = cursor;
    }
    return best;
}
/* Process time events.
 *
 * Walks the list of time events once: events flagged with
 * AE_DELETED_EVENT_ID are unlinked, finalized and freed; events that are due
 * have their timeProc called and are then either rescheduled (the return
 * value is the delay in milliseconds) or flagged for deletion (return value
 * AE_NOMORE). Returns the number of timeProc calls made. */
static int processTimeEvents(aeEventLoop *eventLoop) {
int processed = 0;
aeTimeEvent *te, *prev;
long long maxId;
time_t now = time(NULL);
/* If the system clock is moved to the future, and then set back to the
 * right value, time events may be delayed in a random way. Often this
 * means that scheduled operations will not be performed soon enough.
 *
 * Here we try to detect system clock skews, and force all the time
 * events to be processed ASAP when this happens: the idea is that
 * processing events earlier is less dangerous than delaying them
 * indefinitely, and practice suggests it is. */
if (now < eventLoop->lastTime) {
te = eventLoop->timeEventHead;
while(te) {
te->when_sec = 0;
te = te->next;
}
}
eventLoop->lastTime = now;
prev = NULL;
te = eventLoop->timeEventHead;
/* Highest id that existed before this pass started. */
maxId = eventLoop->timeEventNextId-1;
while(te) {
long now_sec, now_ms;
long long id;
/* Remove events scheduled for deletion. */
if (te->id == AE_DELETED_EVENT_ID) {
aeTimeEvent *next = te->next;
if (prev == NULL)
eventLoop->timeEventHead = te->next;
else
prev->next = te->next;
if (te->finalizerProc)
te->finalizerProc(eventLoop, te->clientData);
zfree(te);
te = next;
/* prev is deliberately left unchanged: it still precedes the new te. */
continue;
}
/* Make sure we don't process time events created by time events in
 * this iteration. Note that this check is currently useless: we always
 * add new timers on the head, however if we change the implementation
 * detail, this check may be useful again: we keep it here for future
 * defense. */
if (te->id > maxId) {
te = te->next;
continue;
}
aeGetTime(&now_sec, &now_ms);
if (now_sec > te->when_sec ||
(now_sec == te->when_sec && now_ms >= te->when_ms))
{
int retval;
id = te->id;
retval = te->timeProc(eventLoop, id, te->clientData);
processed++;
/* A finished event is only flagged here; it is unlinked and freed by the
 * deletion branch above on a later pass. */
if (retval != AE_NOMORE) {
aeAddMillisecondsToNow(retval,&te->when_sec,&te->when_ms);
} else {
te->id = AE_DELETED_EVENT_ID;
}
}
prev = te;
te = te->next;
}
return processed;
}
/* Poll for file events and dispatch them, then process the time events that
 * are due (in that order, see the body below).
 * Without special flags the function sleeps until some file event
 * fires, or when the next time event occurs (if any).
 *
 * If flags is 0, the function does nothing and returns.
 * If flags has AE_ALL_EVENTS set, all kinds of events are processed.
 * If flags has AE_FILE_EVENTS set, file events are polled for and dispatched.
 * (NOTE(review): the polling block below is entered whenever a descriptor is
 * registered, even if AE_FILE_EVENTS is not set.)
 * If flags has AE_TIME_EVENTS set, time events are processed.
 * If flags has AE_DONT_WAIT set, the function does not block: it processes
 * the events that are ready right now and returns.
 *
 * The function returns the number of events processed. */
int aeProcessEvents(aeEventLoop *eventLoop, int flags)
{
int processed = 0, numevents;
/* Nothing to do? return ASAP */
if (!(flags & AE_TIME_EVENTS) && !(flags & AE_FILE_EVENTS)) return 0;
/* Note that we want call select() even if there are no
 * file events to process as long as we want to process time
 * events, in order to sleep until the next time event is ready
 * to fire. */
if (eventLoop->maxfd != -1 ||
((flags & AE_TIME_EVENTS) && !(flags & AE_DONT_WAIT))) {
int j;
aeTimeEvent *shortest = NULL;
struct timeval tv, *tvp;
if (flags & AE_TIME_EVENTS && !(flags & AE_DONT_WAIT))
shortest = aeSearchNearestTimer(eventLoop);
if (shortest) {
long now_sec, now_ms;
aeGetTime(&now_sec, &now_ms);
tvp = &tv;
/* How many milliseconds we need to wait for the next
 * time event to fire? */
long long ms =
(shortest->when_sec - now_sec)*1000 +
shortest->when_ms - now_ms;
/* A timer that is already overdue yields a zero timeout (poll, don't sleep). */
if (ms > 0) {
tvp->tv_sec = ms/1000;
tvp->tv_usec = (ms % 1000)*1000;
} else {
tvp->tv_sec = 0;
tvp->tv_usec = 0;
}
} else {
/* If we have to check for events but need to return
 * ASAP because of AE_DONT_WAIT we need to set the timeout
 * to zero */
if (flags & AE_DONT_WAIT) {
tv.tv_sec = tv.tv_usec = 0;
tvp = &tv;
} else {
/* Otherwise we can block */
tvp = NULL; /* wait forever */
}
}
numevents = aeApiPoll(eventLoop, tvp);
for (j = 0; j < numevents; j++) {
aeFileEvent *fe = &eventLoop->events[eventLoop->fired[j].fd];
int mask = eventLoop->fired[j].mask;
int fd = eventLoop->fired[j].fd;
int rfired = 0;
/* note the fe->mask & mask & ... code: maybe an already processed
 * event removed an element that fired and we still didn't
 * processed, so we check if the event is still valid. */
if (fe->mask & mask & AE_READABLE) {
rfired = 1;
fe->rfileProc(eventLoop,fd,fe->clientData,mask);
}
/* When one handler serves both directions and the read branch already
 * called it, it is not called a second time for the same fired event. */
if (fe->mask & mask & AE_WRITABLE) {
if (!rfired || fe->wfileProc != fe->rfileProc)
fe->wfileProc(eventLoop,fd,fe->clientData,mask);
}
/* Counted once per fired descriptor, whether or not a handler ran. */
processed++;
}
}
/* Check time events */
if (flags & AE_TIME_EVENTS)
processed += processTimeEvents(eventLoop);
return processed; /* return the number of processed file/time events */
}
/* Block for up to `milliseconds` until `fd` becomes readable and/or writable,
 * as requested by `mask`.
 *
 * Returns a mask of AE_READABLE / AE_WRITABLE describing what fired (error
 * and hang-up conditions are reported as AE_WRITABLE). When poll() does not
 * report exactly one ready descriptor, its return value is passed through
 * unchanged: 0 on timeout, -1 on error. */
int aeWait(int fd, int mask, long long milliseconds) {
    struct pollfd request;
    int ready;
    int fired = 0;

    memset(&request, 0, sizeof(request));
    request.fd = fd;
    if (mask & AE_READABLE) request.events |= POLLIN;
    if (mask & AE_WRITABLE) request.events |= POLLOUT;

    ready = poll(&request, 1, milliseconds);
    if (ready != 1) {
        return ready;
    }

    if (request.revents & POLLIN) {
        fired |= AE_READABLE;
    }
    if (request.revents & (POLLOUT | POLLERR | POLLHUP)) {
        fired |= AE_WRITABLE;
    }
    return fired;
}
/* Run the event loop until aeStop() sets the stop flag. The flag is cleared
 * on entry; each iteration first calls the beforesleep hook (if one is
 * installed) and then processes all file and time events, blocking as
 * needed. */
void aeMain(aeEventLoop *eventLoop) {
eventLoop->stop = 0;
while (!eventLoop->stop) {
if (eventLoop->beforesleep != NULL)
eventLoop->beforesleep(eventLoop);
aeProcessEvents(eventLoop, AE_ALL_EVENTS);
}
}
/* Return the name reported by the multiplexing backend that was compiled in
 * (one of the ae_*.c files included at the top of this file). */
char *aeGetApiName(void) {
return aeApiName();
}
/* Install the hook that aeMain() calls at the start of every iteration,
 * before processing events. Passing NULL removes the hook. */
void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep) {
eventLoop->beforesleep = beforesleep;
}
+123
View File
@@ -0,0 +1,123 @@
/* A simple event-driven programming library. Originally I wrote this code
* for the Jim's event-loop (Jim is a Tcl interpreter) but later translated
* it in form of a library for easy reuse.
*
* Copyright (c) 2006-2012, Salvatore Sanfilippo <antirez at gmail dot com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Redis nor the names of its contributors may be used
* to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __AE_H__
#define __AE_H__
#include <time.h>
/* Return codes used by the ae functions. */
#define AE_OK 0
#define AE_ERR -1
/* File event mask bits; AE_NONE marks a descriptor slot with nothing
 * registered. */
#define AE_NONE 0
#define AE_READABLE 1
#define AE_WRITABLE 2
/* Flags accepted by aeProcessEvents(). */
#define AE_FILE_EVENTS 1
#define AE_TIME_EVENTS 2
#define AE_ALL_EVENTS (AE_FILE_EVENTS|AE_TIME_EVENTS)
#define AE_DONT_WAIT 4
/* Returned by an aeTimeProc to say the timer must not be rescheduled. */
#define AE_NOMORE -1
/* Id stored in a time event to mark it for removal on a later pass. */
#define AE_DELETED_EVENT_ID -1
/* Macros */
/* Evaluates V and discards the result. */
#define AE_NOTUSED(V) ((void) V)
/* Forward declaration so the callback types below can refer to the loop. */
struct aeEventLoop;
/* Types and data structures */
/* Called when a registered descriptor fires; mask holds the fired events. */
typedef void aeFileProc(struct aeEventLoop *eventLoop, int fd, void *clientData, int mask);
/* Called when a time event is due. Returns AE_NOMORE to end the timer, or
 * the number of milliseconds after which it should fire again. */
typedef int aeTimeProc(struct aeEventLoop *eventLoop, long long id, void *clientData);
/* Called with the event's clientData when a time event is removed. */
typedef void aeEventFinalizerProc(struct aeEventLoop *eventLoop, void *clientData);
/* Called by aeMain() at the start of every loop iteration. */
typedef void aeBeforeSleepProc(struct aeEventLoop *eventLoop);
/* File event structure: one slot per file descriptor, indexed by fd in
 * aeEventLoop.events. */
typedef struct aeFileEvent {
int mask; /* AE_NONE, or a combination of AE_READABLE and AE_WRITABLE */
aeFileProc *rfileProc; /* handler called for AE_READABLE */
aeFileProc *wfileProc; /* handler called for AE_WRITABLE */
void *clientData; /* passed to both handlers */
} aeFileEvent;
/* Time event structure: a node in a singly linked list (see `next` and
 * aeEventLoop.timeEventHead). */
typedef struct aeTimeEvent {
long long id; /* time event identifier. */
long when_sec; /* seconds */
long when_ms; /* milliseconds */
aeTimeProc *timeProc; /* callback for this time event */
aeEventFinalizerProc *finalizerProc; /* optional finalizer callback */
void *clientData; /* opaque user pointer handed back to the callbacks */
struct aeTimeEvent *next; /* next time event in the list */
} aeTimeEvent;
/* A fired event: each backend's aeApiPoll() fills aeEventLoop.fired[] with
 * one of these per ready file descriptor. */
typedef struct aeFiredEvent {
int fd; /* descriptor that became ready */
int mask; /* AE_READABLE / AE_WRITABLE bits reported by the backend */
} aeFiredEvent;
/* State of an event based program. One instance is shared between the generic
 * loop code and the selected polling backend. */
typedef struct aeEventLoop {
int maxfd; /* highest file descriptor currently registered */
int setsize; /* max number of file descriptors tracked */
long long timeEventNextId; /* presumably the id handed to the next created time event */
time_t lastTime; /* Used to detect system clock skew */
aeFileEvent *events; /* Registered events, indexed by file descriptor */
aeFiredEvent *fired; /* Fired events, written by the backend's aeApiPoll() */
aeTimeEvent *timeEventHead; /* head of the time event list */
int stop; /* stop flag; presumably set by aeStop() -- confirm in ae.c */
void *apidata; /* This is used for polling API specific data */
aeBeforeSleepProc *beforesleep; /* optional callback set via aeSetBeforeSleepProc() */
} aeEventLoop;
/* Prototypes */
aeEventLoop *aeCreateEventLoop(int setsize);
void aeDeleteEventLoop(aeEventLoop *eventLoop);
void aeStop(aeEventLoop *eventLoop);
int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask,
aeFileProc *proc, void *clientData);
void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask);
int aeGetFileEvents(aeEventLoop *eventLoop, int fd);
long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds,
aeTimeProc *proc, void *clientData,
aeEventFinalizerProc *finalizerProc);
int aeDeleteTimeEvent(aeEventLoop *eventLoop, long long id);
int aeProcessEvents(aeEventLoop *eventLoop, int flags);
int aeWait(int fd, int mask, long long milliseconds);
void aeMain(aeEventLoop *eventLoop);
char *aeGetApiName(void);
void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep);
int aeGetSetSize(aeEventLoop *eventLoop);
int aeResizeSetSize(aeEventLoop *eventLoop, int setsize);
#endif
+135
View File
@@ -0,0 +1,135 @@
/* Linux epoll(2) based ae.c module
*
* Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Redis nor the names of its contributors may be used
* to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/epoll.h>
/* Per-loop state of the epoll backend; stored in eventLoop->apidata. */
typedef struct aeApiState {
int epfd; /* epoll instance descriptor returned by epoll_create() */
struct epoll_event *events; /* result buffer passed to epoll_wait(), sized from setsize */
} aeApiState;
/* Set up the epoll backend for eventLoop.
 *
 * Allocates the backend state plus a result buffer of eventLoop->setsize
 * entries, creates the epoll instance and publishes the state through
 * eventLoop->apidata. Returns 0 on success; on any failure everything
 * allocated so far is released and -1 is returned. */
static int aeApiCreate(aeEventLoop *eventLoop) {
    aeApiState *api = zmalloc(sizeof(*api));

    if (api == NULL) return -1;

    api->events = zmalloc(sizeof(struct epoll_event) * eventLoop->setsize);
    if (api->events != NULL) {
        api->epfd = epoll_create(1024); /* 1024 is just a hint for the kernel */
        if (api->epfd != -1) {
            eventLoop->apidata = api;
            return 0;
        }
        zfree(api->events);
    }
    zfree(api);
    return -1;
}
/* Resize the epoll result buffer so it can hold `setsize` events.
 *
 * Returns 0 on success and -1 when the buffer cannot be reallocated. On
 * failure the previous buffer is kept, so the backend stays usable with its
 * old size instead of leaking the buffer and leaving a NULL pointer behind. */
static int aeApiResize(aeEventLoop *eventLoop, int setsize) {
    aeApiState *state = eventLoop->apidata;
    struct epoll_event *resized;

    resized = zrealloc(state->events, sizeof(struct epoll_event) * setsize);
    /* A NULL result for a non-empty request is an allocation failure; the
     * original block is still valid in that case. */
    if (resized == NULL && setsize > 0) return -1;
    state->events = resized;
    return 0;
}
/* Tear down the epoll backend: close the epoll descriptor and release the
 * result buffer and the state object held in eventLoop->apidata. */
static void aeApiFree(aeEventLoop *eventLoop) {
    aeApiState *api = eventLoop->apidata;

    close(api->epfd);
    zfree(api->events);
    zfree(api);
}
/* Start monitoring `fd` for the events in `mask` (AE_READABLE / AE_WRITABLE),
 * in addition to whatever is already registered for it.
 * Returns 0 on success, -1 if epoll_ctl() fails. */
static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) {
    aeApiState *api = eventLoop->apidata;
    int registered = eventLoop->events[fd].mask;
    int wanted = mask | registered; /* Merge old events */
    struct epoll_event ev = {0}; /* avoid valgrind warning */
    int op;

    /* An fd that is already monitored for some event needs a MOD operation;
     * a fresh one needs an ADD. */
    if (registered == AE_NONE)
        op = EPOLL_CTL_ADD;
    else
        op = EPOLL_CTL_MOD;

    ev.events = 0;
    if (wanted & AE_READABLE) ev.events |= EPOLLIN;
    if (wanted & AE_WRITABLE) ev.events |= EPOLLOUT;
    ev.data.fd = fd;

    return (epoll_ctl(api->epfd, op, fd, &ev) == -1) ? -1 : 0;
}
/* Stop monitoring `fd` for the events in `delmask`. If other events remain
 * registered the epoll entry is modified, otherwise it is removed. Errors
 * from epoll_ctl() are ignored. */
static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int delmask) {
    aeApiState *api = eventLoop->apidata;
    int remaining = eventLoop->events[fd].mask & (~delmask);
    struct epoll_event ev = {0}; /* avoid valgrind warning */
    int op;

    ev.events = 0;
    if (remaining & AE_READABLE) ev.events |= EPOLLIN;
    if (remaining & AE_WRITABLE) ev.events |= EPOLLOUT;
    ev.data.fd = fd;

    /* Note, Kernel < 2.6.9 requires a non null event pointer even for
     * EPOLL_CTL_DEL, so `ev` is passed in both cases. */
    op = (remaining == AE_NONE) ? EPOLL_CTL_DEL : EPOLL_CTL_MOD;
    epoll_ctl(api->epfd, op, fd, &ev);
}
/* Wait for events with epoll_wait() and copy them into eventLoop->fired[].
 *
 * tvp gives the maximum wait (converted to milliseconds); NULL waits
 * indefinitely. EPOLLERR and EPOLLHUP are reported as AE_WRITABLE.
 * Returns the number of fired events, or 0 when epoll_wait() reports none
 * or fails. */
static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) {
    aeApiState *api = eventLoop->apidata;
    int timeout_ms, nready, k;

    if (tvp)
        timeout_ms = tvp->tv_sec*1000 + tvp->tv_usec/1000;
    else
        timeout_ms = -1;

    nready = epoll_wait(api->epfd, api->events, eventLoop->setsize, timeout_ms);
    if (nready <= 0) return 0;

    for (k = 0; k < nready; k++) {
        struct epoll_event *ev = &api->events[k];
        int mask = 0;

        if (ev->events & EPOLLIN) mask |= AE_READABLE;
        if (ev->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)) mask |= AE_WRITABLE;
        eventLoop->fired[k].fd = ev->data.fd;
        eventLoop->fired[k].mask = mask;
    }
    return nready;
}
/* Name of this polling backend. */
static char *aeApiName(void) {
    char *backend = "epoll";
    return backend;
}
+320
View File
@@ -0,0 +1,320 @@
/* ae.c module for illumos event ports.
*
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Redis nor the names of its contributors may be used
* to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <assert.h>
#include <errno.h>
#include <port.h>
#include <poll.h>
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
/* When non-zero, the functions in this backend trace their activity to
 * stderr. It is initialized to 0 (tracing off). */
static int evport_debug = 0;
/*
* This file implements the ae API using event ports, present on Solaris-based
* systems since Solaris 10. Using the event port interface, we associate file
* descriptors with the port. Each association also includes the set of poll(2)
* events that the consumer is interested in (e.g., POLLIN and POLLOUT).
*
* There's one tricky piece to this implementation: when we return events via
* aeApiPoll, the corresponding file descriptors become dissociated from the
* port. This is necessary because poll events are level-triggered, so if the
* fd didn't become dissociated, it would immediately fire another event since
* the underlying state hasn't changed yet. We must re-associate the file
* descriptor, but only after we know that our caller has actually read from it.
* The ae API does not tell us exactly when that happens, but we do know that
* it must happen by the time aeApiPoll is called again. Our solution is to
* keep track of the last fds returned by aeApiPoll and re-associate them next
* time aeApiPoll is invoked.
*
* To summarize, in this module, each fd association is EITHER (a) represented
* only via the in-kernel association OR (b) represented by pending_fds and
* pending_masks. (b) is only true for the last fds we returned from aeApiPoll,
* and only until we enter aeApiPoll again (at which point we restore the
* in-kernel association).
*/
/* Maximum number of events fetched by one port_getn() call, and therefore the
 * capacity of the pending arrays below. */
#define MAX_EVENT_BATCHSZ 512
/* Per-loop state of the event-port backend; stored in eventLoop->apidata.
 * The pending arrays remember the fds returned by the last aeApiPoll() so
 * they can be re-associated with the port on the next call. */
typedef struct aeApiState {
int portfd; /* event port */
int npending; /* # of pending fds */
int pending_fds[MAX_EVENT_BATCHSZ]; /* pending fds; -1 marks an unused or deleted slot */
int pending_masks[MAX_EVENT_BATCHSZ]; /* pending fds' masks */
} aeApiState;
/* Set up the event-port backend for eventLoop: allocate the state, create
 * the port, mark every pending slot as empty and publish the state through
 * eventLoop->apidata. Returns 0 on success, -1 on allocation or
 * port_create() failure. */
static int aeApiCreate(aeEventLoop *eventLoop) {
    aeApiState *api = zmalloc(sizeof(*api));
    int slot;

    if (api == NULL) return -1;

    api->portfd = port_create();
    if (api->portfd == -1) {
        zfree(api);
        return -1;
    }

    api->npending = 0;
    slot = 0;
    while (slot < MAX_EVENT_BATCHSZ) {
        api->pending_fds[slot] = -1;
        api->pending_masks[slot] = AE_NONE;
        slot++;
    }

    eventLoop->apidata = api;
    return 0;
}
/* The event-port backend keeps no setsize-dependent storage, so resizing is
 * a no-op that always succeeds (returns 0). */
static int aeApiResize(aeEventLoop *eventLoop, int setsize) {
    (void) eventLoop;
    (void) setsize;
    return 0;
}
/* Tear down the event-port backend: close the port and free the state held
 * in eventLoop->apidata. */
static void aeApiFree(aeEventLoop *eventLoop) {
    aeApiState *api = eventLoop->apidata;

    close(api->portfd);
    zfree(api);
}
/* Find `fd` among the first state->npending pending slots.
 * Returns the slot index of the first match, or -1 if fd is not pending. */
static int aeApiLookupPending(aeApiState *state, int fd) {
    int found = -1;
    int slot = 0;

    while (slot < state->npending) {
        if (state->pending_fds[slot] == fd) {
            found = slot;
            break;
        }
        slot++;
    }
    return found;
}
/*
 * Helper function to invoke port_associate for the given fd and mask.
 *
 * `where` names the caller and is used only as a prefix in diagnostics.
 * `mask` (AE_READABLE / AE_WRITABLE bits) is translated to POLLIN / POLLOUT
 * and is also stored as the association's user pointer so aeApiPoll can
 * recover it later. Returns the port_associate() result: 0 on success, -1 on
 * failure (a diagnostic is written to stderr in that case).
 */
static int aeApiAssociate(const char *where, int portfd, int fd, int mask) {
    int events = 0;
    int rv, err;

    if (mask & AE_READABLE)
        events |= POLLIN;
    if (mask & AE_WRITABLE)
        events |= POLLOUT;

    if (evport_debug)
        fprintf(stderr, "%s: port_associate(%d, 0x%x) = ", where, fd, events);

    rv = port_associate(portfd, PORT_SOURCE_FD, fd, events,
        (void *)(uintptr_t)mask);
    /* Capture errno right away: the fprintf calls below may overwrite it. */
    err = errno;

    if (evport_debug)
        fprintf(stderr, "%d (%s)\n", rv, rv == 0 ? "no error" : strerror(err));

    if (rv == -1) {
        fprintf(stderr, "%s: port_associate: %s\n", where, strerror(err));
        /* Terminate the message with a newline so later stderr output does
         * not run onto the same line. */
        if (err == EAGAIN)
            fprintf(stderr, "aeApiAssociate: event port limit exceeded.\n");
    }
    return rv;
}
/* Register interest in `mask` events for `fd`.
 *
 * port_associate's "events" argument replaces any existing events, so the
 * mask already recorded in eventLoop->events[fd] is merged in first. If the
 * fd is pending (it was returned by the last aeApiPoll and is not currently
 * associated with the port) only its pending mask is updated; it will be
 * re-associated as usual when aeApiPoll is called again.
 * Returns 0 on success, or the aeApiAssociate() result. */
static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) {
    aeApiState *api = eventLoop->apidata;
    int wanted, slot;

    if (evport_debug)
        fprintf(stderr, "aeApiAddEvent: fd %d mask 0x%x\n", fd, mask);

    wanted = mask | eventLoop->events[fd].mask;
    slot = aeApiLookupPending(api, fd);

    if (slot == -1)
        return (aeApiAssociate("aeApiAddEvent", api->portfd, fd, wanted));

    /* Recently returned from aeApiPoll: play it safe and just extend the
     * pending mask instead of touching the port now. */
    if (evport_debug)
        fprintf(stderr, "aeApiAddEvent: adding to pending fd %d\n", fd);
    api->pending_masks[slot] |= wanted;
    return 0;
}
/* Drop interest in `mask` events for `fd`.
 *
 * A pending fd (just returned by aeApiPoll) is not associated with the port,
 * so only its pending mask is updated; the slot is released when no events
 * remain. Otherwise the in-kernel association is updated from
 * eventLoop->events[fd].mask, relying on the caller having already updated
 * that mask. Any failure to update the port aborts the process. */
static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int mask) {
    aeApiState *api = eventLoop->apidata;
    int remaining, slot;

    if (evport_debug)
        fprintf(stderr, "del fd %d mask 0x%x\n", fd, mask);

    slot = aeApiLookupPending(api, fd);
    if (slot != -1) {
        if (evport_debug)
            fprintf(stderr, "deleting event from pending fd %d\n", fd);

        api->pending_masks[slot] &= ~mask;
        if (api->pending_masks[slot] == AE_NONE)
            api->pending_fds[slot] = -1;
        return;
    }

    /* The fd is currently associated with the port. There is no good way to
     * learn its events other than reading the eventLoop state directly. */
    remaining = eventLoop->events[fd].mask;
    if (remaining != AE_NONE) {
        /* ENOMEM is potentially transient, but the kernel won't generally
         * return it unless things are really bad. EAGAIN means a resource
         * limit was reached, for which retrying makes no sense. All other
         * errors indicate a bug. In any of these cases the best we can do is
         * to abort. */
        if (aeApiAssociate("aeApiDelEvent", api->portfd, fd, remaining) != 0)
            abort(); /* will not return */
        return;
    }

    /* Removing *all* events: dissociate the fd completely. Failure here
     * indicates a bug. */
    if (evport_debug)
        fprintf(stderr, "aeApiDelEvent: port_dissociate(%d)\n", fd);
    if (port_dissociate(api->portfd, PORT_SOURCE_FD, fd) != 0) {
        perror("aeApiDelEvent: port_dissociate");
        abort(); /* will not return */
    }
}
/*
* Wait for events on the port and copy them into eventLoop->fired[].
*
* tvp is the maximum time to wait; NULL means no timeout is passed to
* port_getn(). Returns the number of events stored in eventLoop->fired[], or
* 0 when port_getn() fails with ETIME (and no events) or EINTR. Any other
* port_getn() failure, or a failed re-association, aborts the process.
*
* Side effect: every fd returned here is recorded in pending_fds /
* pending_masks and is re-associated with the port at the start of the next
* call.
*/
static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) {
aeApiState *state = eventLoop->apidata;
struct timespec timeout, *tsp;
int mask, i;
uint_t nevents;
port_event_t event[MAX_EVENT_BATCHSZ];
/*
* If we've returned fd events before, we must re-associate them with the
* port now, before calling port_get(). See the block comment at the top of
* this file for an explanation of why.
*/
for (i = 0; i < state->npending; i++) {
if (state->pending_fds[i] == -1)
/* This fd has since been deleted. */
continue;
if (aeApiAssociate("aeApiPoll", state->portfd,
state->pending_fds[i], state->pending_masks[i]) != 0) {
/* See aeApiDelEvent for why this case is fatal. */
abort();
}
/* Slot handled: mark it empty again. */
state->pending_masks[i] = AE_NONE;
state->pending_fds[i] = -1;
}
state->npending = 0;
/* Convert the timeval (microseconds) into the timespec (nanoseconds) that
* port_getn() takes. */
if (tvp != NULL) {
timeout.tv_sec = tvp->tv_sec;
timeout.tv_nsec = tvp->tv_usec * 1000;
tsp = &timeout;
} else {
tsp = NULL;
}
/*
* port_getn can return with errno == ETIME having returned some events (!).
* So if we get ETIME, we check nevents, too.
*/
/* nevents is in/out: set to 1 before the call, read back afterwards as the
* number of events retrieved. */
nevents = 1;
if (port_getn(state->portfd, event, MAX_EVENT_BATCHSZ, &nevents,
tsp) == -1 && (errno != ETIME || nevents == 0)) {
if (errno == ETIME || errno == EINTR)
return 0;
/* Any other error indicates a bug. */
/* NOTE(review): the message says port_get although port_getn is called. */
perror("aeApiPoll: port_get");
abort();
}
state->npending = nevents;
/* NOTE(review): i is int while nevents is uint_t, so this comparison mixes
* signed and unsigned operands. */
for (i = 0; i < nevents; i++) {
mask = 0;
if (event[i].portev_events & POLLIN)
mask |= AE_READABLE;
if (event[i].portev_events & POLLOUT)
mask |= AE_WRITABLE;
eventLoop->fired[i].fd = event[i].portev_object;
eventLoop->fired[i].mask = mask;
if (evport_debug)
fprintf(stderr, "aeApiPoll: fd %d mask 0x%x\n",
(int)event[i].portev_object, mask);
/* Remember the fd and the full mask that aeApiAssociate stored as the
* user pointer, so the association can be restored on the next call. */
state->pending_fds[i] = event[i].portev_object;
state->pending_masks[i] = (uintptr_t)event[i].portev_user;
}
return nevents;
}
/* Name of this polling backend. */
static char *aeApiName(void) {
    char *backend = "evport";
    return backend;
}
+138
View File
@@ -0,0 +1,138 @@
/* Kqueue(2)-based ae.c module
*
* Copyright (C) 2009 Harish Mallipeddi - harish.mallipeddi@gmail.com
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Redis nor the names of its contributors may be used
* to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
/* Per-loop state of the kqueue backend; stored in eventLoop->apidata. */
typedef struct aeApiState {
int kqfd; /* kqueue descriptor returned by kqueue() */
struct kevent *events; /* result buffer passed to kevent(), sized from setsize */
} aeApiState;
/* Set up the kqueue backend for eventLoop.
 *
 * Allocates the backend state plus a result buffer of eventLoop->setsize
 * entries, creates the kqueue and publishes the state through
 * eventLoop->apidata. Returns 0 on success; on any failure everything
 * allocated so far is released and -1 is returned. */
static int aeApiCreate(aeEventLoop *eventLoop) {
    aeApiState *api = zmalloc(sizeof(*api));

    if (api == NULL) return -1;

    api->events = zmalloc(sizeof(struct kevent) * eventLoop->setsize);
    if (api->events != NULL) {
        api->kqfd = kqueue();
        if (api->kqfd != -1) {
            eventLoop->apidata = api;
            return 0;
        }
        zfree(api->events);
    }
    zfree(api);
    return -1;
}
/* Resize the kevent result buffer so it can hold `setsize` events.
 *
 * Returns 0 on success and -1 when the buffer cannot be reallocated. On
 * failure the previous buffer is kept, so the backend stays usable with its
 * old size instead of leaking the buffer and leaving a NULL pointer behind. */
static int aeApiResize(aeEventLoop *eventLoop, int setsize) {
    aeApiState *state = eventLoop->apidata;
    struct kevent *resized;

    resized = zrealloc(state->events, sizeof(struct kevent) * setsize);
    /* A NULL result for a non-empty request is an allocation failure; the
     * original block is still valid in that case. */
    if (resized == NULL && setsize > 0) return -1;
    state->events = resized;
    return 0;
}
/* Tear down the kqueue backend: close the kqueue descriptor and release the
 * result buffer and the state object held in eventLoop->apidata. */
static void aeApiFree(aeEventLoop *eventLoop) {
    aeApiState *api = eventLoop->apidata;

    close(api->kqfd);
    zfree(api->events);
    zfree(api);
}
/* Register `fd` with the kqueue for each event requested in `mask`:
 * EVFILT_READ for AE_READABLE, then EVFILT_WRITE for AE_WRITABLE.
 * Returns 0 on success, -1 as soon as a kevent() call fails. */
static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) {
    aeApiState *api = eventLoop->apidata;
    const int ae_flags[2] = { AE_READABLE, AE_WRITABLE };
    const int filters[2] = { EVFILT_READ, EVFILT_WRITE };
    struct kevent change;
    int k;

    for (k = 0; k < 2; k++) {
        if (!(mask & ae_flags[k])) continue;
        EV_SET(&change, fd, filters[k], EV_ADD, 0, 0, NULL);
        if (kevent(api->kqfd, &change, 1, NULL, 0, NULL) == -1) return -1;
    }
    return 0;
}
/* Remove `fd` from the kqueue for each event named in `mask`: EVFILT_READ
 * for AE_READABLE, then EVFILT_WRITE for AE_WRITABLE. Errors from kevent()
 * are ignored. */
static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int mask) {
    aeApiState *api = eventLoop->apidata;
    const int ae_flags[2] = { AE_READABLE, AE_WRITABLE };
    const int filters[2] = { EVFILT_READ, EVFILT_WRITE };
    struct kevent change;
    int k;

    for (k = 0; k < 2; k++) {
        if (!(mask & ae_flags[k])) continue;
        EV_SET(&change, fd, filters[k], EV_DELETE, 0, 0, NULL);
        kevent(api->kqfd, &change, 1, NULL, 0, NULL);
    }
}
/* Wait for events with kevent() and copy them into eventLoop->fired[].
 *
 * tvp gives the maximum wait; NULL passes no timeout to kevent(). Each
 * returned kevent yields one fired entry carrying AE_READABLE or
 * AE_WRITABLE according to its filter. Returns the number of fired events,
 * or 0 when kevent() reports none or fails. */
static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) {
    aeApiState *api = eventLoop->apidata;
    struct timespec ts;
    struct timespec *tsp = NULL;
    int nready, k;

    if (tvp != NULL) {
        ts.tv_sec = tvp->tv_sec;
        ts.tv_nsec = tvp->tv_usec * 1000;
        tsp = &ts;
    }

    nready = kevent(api->kqfd, NULL, 0, api->events, eventLoop->setsize, tsp);
    if (nready <= 0) return 0;

    for (k = 0; k < nready; k++) {
        struct kevent *ev = &api->events[k];
        int mask = 0;

        if (ev->filter == EVFILT_READ) mask |= AE_READABLE;
        if (ev->filter == EVFILT_WRITE) mask |= AE_WRITABLE;
        eventLoop->fired[k].fd = ev->ident;
        eventLoop->fired[k].mask = mask;
    }
    return nready;
}
/* Name of this polling backend. */
static char *aeApiName(void) {
    char *backend = "kqueue";
    return backend;
}
+106
View File
@@ -0,0 +1,106 @@
/* Select()-based ae.c module.
*
* Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Redis nor the names of its contributors may be used
* to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/select.h>
#include <string.h>
/* Per-loop state of the select() backend; stored in eventLoop->apidata. */
typedef struct aeApiState {
fd_set rfds, wfds; /* descriptors registered for read / write interest */
/* We need to have a copy of the fd sets as it's not safe to reuse
* FD sets after select(). aeApiPoll refreshes these copies before every
* select() call. */
fd_set _rfds, _wfds;
} aeApiState;
/* Set up the select() backend: allocate the state, clear the registered
 * read/write sets and publish the state through eventLoop->apidata.
 * Returns 0 on success, -1 if the allocation fails. */
static int aeApiCreate(aeEventLoop *eventLoop) {
    aeApiState *api = zmalloc(sizeof(*api));

    if (api != NULL) {
        FD_ZERO(&api->rfds);
        FD_ZERO(&api->wfds);
        eventLoop->apidata = api;
        return 0;
    }
    return -1;
}
/* The fd_set type has a fixed capacity, so there is nothing to reallocate:
 * just report whether `setsize` fits. Returns 0 if it does, -1 when
 * setsize >= FD_SETSIZE. */
static int aeApiResize(aeEventLoop *eventLoop, int setsize) {
    return (setsize >= FD_SETSIZE) ? -1 : 0;
}
/* Release the select() backend state held in eventLoop->apidata. */
static void aeApiFree(aeEventLoop *eventLoop) {
    aeApiState *api = eventLoop->apidata;

    zfree(api);
}
/* Add `fd` to the registered read and/or write set according to `mask`.
 * Always returns 0. */
static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) {
    aeApiState *api = eventLoop->apidata;
    int want_read = mask & AE_READABLE;
    int want_write = mask & AE_WRITABLE;

    if (want_read) {
        FD_SET(fd, &api->rfds);
    }
    if (want_write) {
        FD_SET(fd, &api->wfds);
    }
    return 0;
}
/* Remove `fd` from the registered read and/or write set according to
 * `mask`. */
static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int mask) {
    aeApiState *api = eventLoop->apidata;
    int drop_read = mask & AE_READABLE;
    int drop_write = mask & AE_WRITABLE;

    if (drop_read) {
        FD_CLR(fd, &api->rfds);
    }
    if (drop_write) {
        FD_CLR(fd, &api->wfds);
    }
}
/* Wait for events with select() and copy the ready descriptors into
 * eventLoop->fired[].
 *
 * tvp is handed to select() as the timeout (NULL blocks indefinitely).
 * Only descriptors that select() actually flagged are reported, each with
 * the AE_READABLE / AE_WRITABLE bits that are both registered and ready.
 * Returns the number of entries written to eventLoop->fired[], or 0 when
 * select() reports nothing or fails. */
static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) {
    aeApiState *state = eventLoop->apidata;
    int retval, j, numevents = 0;

    /* select() overwrites the sets it is given, so poll on scratch copies
     * and keep the registered sets intact. */
    memcpy(&state->_rfds, &state->rfds, sizeof(fd_set));
    memcpy(&state->_wfds, &state->wfds, sizeof(fd_set));

    retval = select(eventLoop->maxfd + 1,
                    &state->_rfds, &state->_wfds, NULL, tvp);
    if (retval > 0) {
        for (j = 0; j <= eventLoop->maxfd; j++) {
            int mask = 0;
            aeFileEvent *fe = &eventLoop->events[j];

            if (fe->mask == AE_NONE) continue;
            if (fe->mask & AE_READABLE && FD_ISSET(j, &state->_rfds))
                mask |= AE_READABLE;
            if (fe->mask & AE_WRITABLE && FD_ISSET(j, &state->_wfds))
                mask |= AE_WRITABLE;
            /* Registered but not ready: do not report it as fired. */
            if (mask == AE_NONE) continue;
            eventLoop->fired[numevents].fd = j;
            eventLoop->fired[numevents].mask = mask;
            numevents++;
        }
    }
    return numevents;
}
/* Name of this polling backend. */
static char *aeApiName(void) {
    char *backend = "select";
    return backend;
}
+54
View File
@@ -0,0 +1,54 @@
/*
* Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Redis nor the names of its contributors may be used
* to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __CONFIG_H
#define __CONFIG_H
/* AvailabilityMacros.h supplies the MAC_OS_X_VERSION_* macros tested below. */
#ifdef __APPLE__
#include <AvailabilityMacros.h>
#endif
/* Test for polling API */
/* Linux: epoll. */
#ifdef __linux__
#define HAVE_EPOLL 1
#endif
/* macOS (when MAC_OS_X_VERSION_10_6 is defined) and the BSDs: kqueue. */
#if (defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined (__NetBSD__)
#define HAVE_KQUEUE 1
#endif
/* Solaris-derived systems: event ports, enabled only when _DTRACE_VERSION is
 * defined after including <sys/feature_tests.h>. */
#ifdef __sun
#include <sys/feature_tests.h>
#ifdef _DTRACE_VERSION
#define HAVE_EVPORT 1
#endif
#endif
#endif
+16
View File
@@ -0,0 +1,16 @@
#ifndef _ZMALLOC_H
#define _ZMALLOC_H
/* Map the z* allocator names onto the C library allocator unless the
 * including code has already defined them. Note that this header does not
 * include <stdlib.h> itself. */
#ifndef zmalloc
#define zmalloc malloc
#endif
#ifndef zfree
#define zfree free
#endif
#ifndef zrealloc
#define zrealloc realloc
#endif
#endif /* _ZMALLOC_H */
+6
View File
@@ -0,0 +1,6 @@
# Download and build redis 3.2.3 in the current directory unless the server
# binary already exists. Each step runs only if the previous one succeeded;
# any failure stops the script with a non-zero status instead of running
# `make` in the wrong directory.
# NOTE(review): the tarball is fetched over plain http and is not checksummed.
if [ ! -f redis-3.2.3/src/redis-server ]; then
  wget http://download.redis.io/releases/redis-3.2.3.tar.gz &&
  tar xvfz redis-3.2.3.tar.gz &&
  cd redis-3.2.3 &&
  make || exit 1
fi
+1023
View File
File diff suppressed because it is too large Load Diff
+238
View File
@@ -0,0 +1,238 @@
/*
Copyright (c) 2008-2016, Troy D. Hanson http://troydhanson.github.com/uthash/
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* a dynamic array implementation using macros
*/
#ifndef UTARRAY_H
#define UTARRAY_H
#define UTARRAY_VERSION 2.0.1
/* _UNUSED_ marks a definition as intentionally unused: it expands to the
 * __unused__ attribute when __GNUC__ is defined and to nothing otherwise. */
#ifdef __GNUC__
#define _UNUSED_ __attribute__ ((__unused__))
#else
#define _UNUSED_
#endif
#include <stddef.h> /* size_t */
#include <string.h> /* memset, etc */
#include <stdlib.h> /* exit */
/* Out-of-memory handler invoked by the allocating macros in this header.
 * Define oom before including the header to override the default, which
 * terminates the process with exit(-1). */
#ifndef oom
#define oom() exit(-1)
#endif
/* Element hook signatures: copy-construct dst from src, destroy an element,
 * and initialize a fresh element. */
typedef void (ctor_f)(void *dst, const void *src);
typedef void (dtor_f)(void *elt);
typedef void (init_f)(void *elt);
/* Element descriptor ("icd": init, copy, dtor). Any hook may be NULL, in
 * which case the macros fall back to memset / memcpy / no-op respectively. */
typedef struct {
size_t sz; /* size of one element in bytes */
init_f *init; /* initializer for new elements, or NULL to zero-fill */
ctor_f *copy; /* copy constructor, or NULL to memcpy */
dtor_f *dtor; /* destructor, or NULL if elements need no cleanup */
} UT_icd;
/* The dynamic array itself: i elements in use out of n allocated slots. */
typedef struct {
unsigned i,n;/* i: index of next available slot, n: num slots */
UT_icd icd; /* initializer, copy and destructor functions */
char *d; /* n slots of size icd->sz*/
} UT_array;
/* Initialize caller-provided storage `a`: zero the whole UT_array, then copy
 * the element descriptor *_icd into it. */
#define utarray_init(a,_icd) do { \
memset(a,0,sizeof(UT_array)); \
(a)->icd = *(_icd); \
} while(0)
/* Release the contents of `a` (not `a` itself): if slots were allocated, run
 * the destructor (when set) on each of the i elements in use and free the
 * storage. Only n is reset afterwards; i and d are left as they were. */
#define utarray_done(a) do { \
if ((a)->n) { \
if ((a)->icd.dtor) { \
unsigned _ut_i; \
for(_ut_i=0; _ut_i < (a)->i; _ut_i++) { \
(a)->icd.dtor(utarray_eltptr(a,_ut_i)); \
} \
} \
free((a)->d); \
} \
(a)->n=0; \
} while(0)
/* Heap-allocate a UT_array, store the pointer in `a` and initialize it with
 * *_icd. Calls oom() if malloc fails. */
#define utarray_new(a,_icd) do { \
(a) = (UT_array*)malloc(sizeof(UT_array)); \
if ((a) == NULL) oom(); \
utarray_init(a,_icd); \
} while(0)
/* Counterpart of utarray_new: release the contents with utarray_done, then
 * free the UT_array object itself. */
#define utarray_free(a) do { \
utarray_done(a); \
free(a); \
} while(0)
/* Ensure there is room for `by` more elements. The slot count starts at 8
 * and doubles until i+by fits; the storage is then reallocated. Calls oom()
 * if realloc fails. */
#define utarray_reserve(a,by) do { \
if (((a)->i+(by)) > (a)->n) { \
char *utarray_tmp; \
while (((a)->i+(by)) > (a)->n) { (a)->n = ((a)->n ? (2*(a)->n) : 8); } \
utarray_tmp=(char*)realloc((a)->d, (a)->n*(a)->icd.sz); \
if (utarray_tmp == NULL) oom(); \
(a)->d=utarray_tmp; \
} \
} while(0)
/* Append a copy of the element pointed to by `p`: uses icd.copy when set,
 * otherwise memcpy of icd.sz bytes. Grows the array as needed. */
#define utarray_push_back(a,p) do { \
utarray_reserve(a,1); \
if ((a)->icd.copy) { (a)->icd.copy( _utarray_eltptr(a,(a)->i++), p); } \
else { memcpy(_utarray_eltptr(a,(a)->i++), p, (a)->icd.sz); }; \
} while(0)
/* Remove the last element, running icd.dtor on it when set. There is no
 * emptiness check here: the caller must ensure the array is non-empty. */
#define utarray_pop_back(a) do { \
if ((a)->icd.dtor) { (a)->icd.dtor( _utarray_eltptr(a,--((a)->i))); } \
else { (a)->i--; } \
} while(0)
/* Append one new element without a source value: it is set up with icd.init
 * when set, otherwise zero-filled. */
#define utarray_extend_back(a) do { \
utarray_reserve(a,1); \
if ((a)->icd.init) { (a)->icd.init(_utarray_eltptr(a,(a)->i)); } \
else { memset(_utarray_eltptr(a,(a)->i),0,(a)->icd.sz); } \
(a)->i++; \
} while(0)
/* Number of elements currently in the array. */
#define utarray_len(a) ((a)->i)
/* Bounds-checked element access: pointer to element j, or NULL when j is not
 * below the current length. */
#define utarray_eltptr(a,j) (((j) < (a)->i) ? _utarray_eltptr(a,j) : NULL)
/* Unchecked address of slot j (internal helper; j is not validated). */
#define _utarray_eltptr(a,j) ((a)->d + ((a)->icd.sz * (j)))
/* Insert a copy of *p at index j. If j is beyond the current length the
 * array is first resized to j (new elements are initialized by
 * utarray_resize); existing elements from j onward are shifted up by one. */
#define utarray_insert(a,p,j) do { \
if ((j) > (a)->i) utarray_resize(a,j); \
utarray_reserve(a,1); \
if ((j) < (a)->i) { \
memmove( _utarray_eltptr(a,(j)+1), _utarray_eltptr(a,j), \
((a)->i - (j))*((a)->icd.sz)); \
} \
if ((a)->icd.copy) { (a)->icd.copy( _utarray_eltptr(a,j), p); } \
else { memcpy(_utarray_eltptr(a,j), p, (a)->icd.sz); }; \
(a)->i++; \
} while(0)
/* Insert copies of all elements of array `w` into `a` at index j. Does
 * nothing when `w` is empty (the `break` leaves the do/while). Elements are
 * copied with a's icd.copy when set, otherwise with one memcpy. */
#define utarray_inserta(a,w,j) do { \
if (utarray_len(w) == 0) break; \
if ((j) > (a)->i) utarray_resize(a,j); \
utarray_reserve(a,utarray_len(w)); \
if ((j) < (a)->i) { \
memmove(_utarray_eltptr(a,(j)+utarray_len(w)), \
_utarray_eltptr(a,j), \
((a)->i - (j))*((a)->icd.sz)); \
} \
if ((a)->icd.copy) { \
unsigned _ut_i; \
for(_ut_i=0;_ut_i<(w)->i;_ut_i++) { \
(a)->icd.copy(_utarray_eltptr(a, (j) + _ut_i), _utarray_eltptr(w, _ut_i)); \
} \
} else { \
memcpy(_utarray_eltptr(a,j), _utarray_eltptr(w,0), \
utarray_len(w)*((a)->icd.sz)); \
} \
(a)->i += utarray_len(w); \
} while(0)
/* Set the length of `dst` to `num`. Shrinking runs icd.dtor (when set) on
 * the dropped elements; growing reserves space and sets up the new elements
 * with icd.init when set, otherwise zero-fills them. */
#define utarray_resize(dst,num) do { \
unsigned _ut_i; \
if ((dst)->i > (unsigned)(num)) { \
if ((dst)->icd.dtor) { \
for (_ut_i = (num); _ut_i < (dst)->i; ++_ut_i) { \
(dst)->icd.dtor(_utarray_eltptr(dst, _ut_i)); \
} \
} \
} else if ((dst)->i < (unsigned)(num)) { \
utarray_reserve(dst, (num) - (dst)->i); \
if ((dst)->icd.init) { \
for (_ut_i = (dst)->i; _ut_i < (unsigned)(num); ++_ut_i) { \
(dst)->icd.init(_utarray_eltptr(dst, _ut_i)); \
} \
} else { \
memset(_utarray_eltptr(dst, (dst)->i), 0, (dst)->icd.sz*((num) - (dst)->i)); \
} \
} \
(dst)->i = (num); \
} while(0)
/* Append copies of all elements of `src` to the end of `dst`. */
#define utarray_concat(dst,src) do { \
utarray_inserta(dst, src, utarray_len(dst)); \
} while(0)
/* Remove `len` elements starting at index `pos`: run icd.dtor (when set) on
 * each, move the tail down over the gap and reduce the length by `len`.
 * NOTE(review): the destructor loop uses the bounds-checked utarray_eltptr,
 * which yields NULL for indexes past the end; pos+len is presumably expected
 * to stay within the current length. */
#define utarray_erase(a,pos,len) do { \
if ((a)->icd.dtor) { \
unsigned _ut_i; \
for (_ut_i = 0; _ut_i < (len); _ut_i++) { \
(a)->icd.dtor(utarray_eltptr(a, (pos) + _ut_i)); \
} \
} \
if ((a)->i > ((pos) + (len))) { \
memmove(_utarray_eltptr(a, pos), _utarray_eltptr(a, (pos) + (len)), \
((a)->i - ((pos) + (len))) * (a)->icd.sz); \
} \
(a)->i -= (len); \
} while(0)
/* Give the caller an empty array: clear `a` if it is non-NULL, otherwise
 * allocate a new one with element descriptor `u`. */
#define utarray_renew(a,u) do { \
if (a) utarray_clear(a); \
else utarray_new(a, u); \
} while(0)
/* Remove all elements, running icd.dtor on each when set, and reset the
 * length to 0. The allocated storage (d and n) is left untouched. */
#define utarray_clear(a) do { \
if ((a)->i > 0) { \
if ((a)->icd.dtor) { \
unsigned _ut_i; \
for(_ut_i=0; _ut_i < (a)->i; _ut_i++) { \
(a)->icd.dtor(_utarray_eltptr(a, _ut_i)); \
} \
} \
(a)->i = 0; \
} \
} while(0)
/* Sort the elements in place with qsort() using comparator `cmp`. */
#define utarray_sort(a,cmp) do { \
qsort((a)->d, (a)->i, (a)->icd.sz, cmp); \
} while(0)
/* Look up key `v` with bsearch() using comparator `cmp`; as with any
 * bsearch() use, the array must already be ordered consistently with cmp. */
#define utarray_find(a,v,cmp) bsearch((v),(a)->d,(a)->i,(a)->icd.sz,cmp)
/* Pointer to the first element, or NULL when the array is empty. */
#define utarray_front(a) (((a)->i) ? (_utarray_eltptr(a,0)) : NULL)
/* Forward iteration: the element after `e`, the first element when e is
 * NULL, or NULL when there is no next element. */
#define utarray_next(a,e) (((e)==NULL) ? utarray_front(a) : ((((a)->i) > (utarray_eltidx(a,e)+1)) ? _utarray_eltptr(a,utarray_eltidx(a,e)+1) : NULL))
/* Backward iteration: the element before `e`, the last element when e is
 * NULL, or NULL when there is no previous element. */
#define utarray_prev(a,e) (((e)==NULL) ? utarray_back(a) : ((utarray_eltidx(a,e) > 0) ? _utarray_eltptr(a,utarray_eltidx(a,e)-1) : NULL))
/* Pointer to the last element, or NULL when the array is empty. */
#define utarray_back(a) (((a)->i) ? (_utarray_eltptr(a,(a)->i-1)) : NULL)
/* Index of the element that `e` points to; yields -1 when e lies before the
 * start of the storage. */
#define utarray_eltidx(a,e) (((char*)(e) >= (a)->d) ? (((char*)(e) - (a)->d)/(a)->icd.sz) : -1)
/* last we pre-define a few icd for common utarrays of ints and strings */
static void utarray_str_cpy(void *dst, const void *src) {
char **_src = (char**)src, **_dst = (char**)dst;
*_dst = (*_src == NULL) ? NULL : strdup(*_src);
}
static void utarray_str_dtor(void *elt) {
char **eltc = (char**)elt;
if (*eltc != NULL) free(*eltc);
}
/* String elements: each slot is a char*; copies go through utarray_str_cpy
 * and element teardown goes through utarray_str_dtor. */
static const UT_icd ut_str_icd _UNUSED_ = {sizeof(char*),NULL,utarray_str_cpy,utarray_str_dtor};
/* int elements: every hook is NULL. */
static const UT_icd ut_int_icd _UNUSED_ = {sizeof(int),NULL,NULL,NULL};
/* Raw pointer elements: every hook is NULL, so the array never frees what the
 * stored pointers refer to. */
static const UT_icd ut_ptr_icd _UNUSED_ = {sizeof(void*),NULL,NULL,NULL};
#endif /* UTARRAY_H */
+1074
View File
File diff suppressed because it is too large Load Diff
+895
View File
@@ -0,0 +1,895 @@
/*
Copyright (c) 2007-2016, Troy D. Hanson http://troydhanson.github.com/uthash/
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef UTLIST_H
#define UTLIST_H
#define UTLIST_VERSION 2.0.1
#include <assert.h>
/*
* This file contains macros to manipulate singly and doubly-linked lists.
*
* 1. LL_ macros: singly-linked lists.
* 2. DL_ macros: doubly-linked lists.
* 3. CDL_ macros: circular doubly-linked lists.
*
* To use singly-linked lists, your structure must have a "next" pointer.
* To use doubly-linked lists, your structure must have "prev" and "next" pointers.
* Either way, the pointer to the head of the list must be initialized to NULL.
*
* ---------------- EXAMPLE -------------------------
* struct item {
* int id;
* struct item *prev, *next;
* };
*
* struct item *list = NULL;
*
* int main() {
* struct item *item;
* ... allocate and populate item ...
* DL_APPEND(list, item);
* }
* --------------------------------------------------
*
* For doubly-linked lists, the append and delete macros are O(1)
* For singly-linked lists, append and delete are O(n) but prepend is O(1)
* The sort macro is O(n log(n)) for all types of single/double/circular lists.
*/
/* These macros use decltype or the earlier __typeof GNU extension.
As decltype is only available in newer compilers (VS2010 or gcc 4.3+
when compiling c++ code), this code uses whatever method is needed
or, for VS2008 where neither is available, uses casting workarounds. */
/* Select how the list macros declare a temporary of the list's pointer type:
 * LDECLTYPE(x) becomes decltype(x) or __typeof(x) where one is available;
 * otherwise NO_DECLTYPE is defined and the casting helpers below are used. */
#ifdef _MSC_VER /* MS compiler */
#if _MSC_VER >= 1600 && defined(__cplusplus) /* VS2010 or newer in C++ mode */
#define LDECLTYPE(x) decltype(x)
#else /* VS2008 or older (or VS2010 in C mode) */
#define NO_DECLTYPE
#endif
#elif defined(__ICCARM__)
#define NO_DECLTYPE
#else /* GNU, Sun and other compilers */
#define LDECLTYPE(x) __typeof(x)
#endif
/* for VS2008 we use some workarounds to get around the lack of decltype,
 * namely, we always reassign our tmp variable to the list head if we need
 * to dereference its prev/next pointers, and save/restore the real head.*/
/* Helper summary (both variants expose the same names):
 *   IF_NO_DECLTYPE(x)  expands to x only in NO_DECLTYPE builds.
 *   _SV(elt,list)      save the head in _tmp and point list at elt.
 *   _NEXT(...)         read the element's next pointer.
 *   _NEXTASGN/_PREVASGN(..., to, field)  store `to` into next/prev.
 *   _RS(list)          restore the head saved by _SV.
 *   _CASTASGN(a,b)     assign b to a (through a char* alias when the real
 *                      type cannot be named).
 * With decltype/typeof available, _SV and _RS expand to nothing and the
 * accessors act on elt directly. */
#ifdef NO_DECLTYPE
#define IF_NO_DECLTYPE(x) x
#define LDECLTYPE(x) char*
#define _SV(elt,list) _tmp = (char*)(list); {char **_alias = (char**)&(list); *_alias = (elt); }
#define _NEXT(elt,list,next) ((char*)((list)->next))
#define _NEXTASGN(elt,list,to,next) { char **_alias = (char**)&((list)->next); *_alias=(char*)(to); }
/* #define _PREV(elt,list,prev) ((char*)((list)->prev)) */
#define _PREVASGN(elt,list,to,prev) { char **_alias = (char**)&((list)->prev); *_alias=(char*)(to); }
#define _RS(list) { char **_alias = (char**)&(list); *_alias=_tmp; }
#define _CASTASGN(a,b) { char **_alias = (char**)&(a); *_alias=(char*)(b); }
#else
#define IF_NO_DECLTYPE(x)
#define _SV(elt,list)
#define _NEXT(elt,list,next) ((elt)->next)
#define _NEXTASGN(elt,list,to,next) ((elt)->next)=(to)
/* #define _PREV(elt,list,prev) ((elt)->prev) */
#define _PREVASGN(elt,list,to,prev) ((elt)->prev)=(to)
#define _RS(list)
#define _CASTASGN(a,b) (a)=(b)
#endif
/******************************************************************************
* The sort macro is an adaptation of Simon Tatham's O(n log(n)) mergesort *
* Unwieldy variable names used here to avoid shadowing passed-in variables. *
*****************************************************************************/
/* LL_SORT(list,cmp): sort a singly-linked list in place using the field
 * named "next". cmp(a,b) receives two element pointers; when it returns
 * <= 0 the element from the earlier run (a) is emitted first. */
#define LL_SORT(list, cmp) \
LL_SORT2(list, cmp, next)
/* LL_SORT2(list,cmp,next): as LL_SORT with the link field name supplied.
 * Each pass of the outer loop merges adjacent runs of _ls_insize elements
 * (_ls_p walks the left run, _ls_q the right run, _ls_tail is the end of the
 * merged output). The run size doubles after every pass and the loop stops
 * once a pass performed at most one merge. The final element's next pointer
 * is set to NULL at the end of each pass. */
#define LL_SORT2(list, cmp, next) \
do { \
LDECLTYPE(list) _ls_p; \
LDECLTYPE(list) _ls_q; \
LDECLTYPE(list) _ls_e; \
LDECLTYPE(list) _ls_tail; \
IF_NO_DECLTYPE(LDECLTYPE(list) _tmp;) \
int _ls_insize, _ls_nmerges, _ls_psize, _ls_qsize, _ls_i, _ls_looping; \
if (list) { \
_ls_insize = 1; \
_ls_looping = 1; \
while (_ls_looping) { \
_CASTASGN(_ls_p,list); \
(list) = NULL; \
_ls_tail = NULL; \
_ls_nmerges = 0; \
while (_ls_p) { \
_ls_nmerges++; \
_ls_q = _ls_p; \
_ls_psize = 0; \
for (_ls_i = 0; _ls_i < _ls_insize; _ls_i++) { \
_ls_psize++; \
_SV(_ls_q,list); _ls_q = _NEXT(_ls_q,list,next); _RS(list); \
if (!_ls_q) break; \
} \
_ls_qsize = _ls_insize; \
while (_ls_psize > 0 || (_ls_qsize > 0 && _ls_q)) { \
if (_ls_psize == 0) { \
_ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \
_NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \
} else if (_ls_qsize == 0 || !_ls_q) { \
_ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \
_NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \
} else if (cmp(_ls_p,_ls_q) <= 0) { \
_ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \
_NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \
} else { \
_ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \
_NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \
} \
if (_ls_tail) { \
_SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_ls_e,next); _RS(list); \
} else { \
_CASTASGN(list,_ls_e); \
} \
_ls_tail = _ls_e; \
} \
_ls_p = _ls_q; \
} \
if (_ls_tail) { \
_SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,NULL,next); _RS(list); \
} \
if (_ls_nmerges <= 1) { \
_ls_looping=0; \
} \
_ls_insize *= 2; \
} \
} \
} while (0)
/* DL_SORT(list,cmp): sort a doubly-linked list in place using the fields
 * named "prev" and "next". cmp(a,b) receives two element pointers; when it
 * returns <= 0 the element from the earlier run (a) is emitted first. */
#define DL_SORT(list, cmp) \
DL_SORT2(list, cmp, prev, next)
/* DL_SORT2(list,cmp,prev,next): as DL_SORT with the link field names given.
 * Same run-doubling merge as LL_SORT2, with each emitted element's prev
 * pointer set to the previous output element. After every pass the head's
 * prev is pointed at the last element and the last element's next is set
 * to NULL. */
#define DL_SORT2(list, cmp, prev, next) \
do { \
LDECLTYPE(list) _ls_p; \
LDECLTYPE(list) _ls_q; \
LDECLTYPE(list) _ls_e; \
LDECLTYPE(list) _ls_tail; \
IF_NO_DECLTYPE(LDECLTYPE(list) _tmp;) \
int _ls_insize, _ls_nmerges, _ls_psize, _ls_qsize, _ls_i, _ls_looping; \
if (list) { \
_ls_insize = 1; \
_ls_looping = 1; \
while (_ls_looping) { \
_CASTASGN(_ls_p,list); \
(list) = NULL; \
_ls_tail = NULL; \
_ls_nmerges = 0; \
while (_ls_p) { \
_ls_nmerges++; \
_ls_q = _ls_p; \
_ls_psize = 0; \
for (_ls_i = 0; _ls_i < _ls_insize; _ls_i++) { \
_ls_psize++; \
_SV(_ls_q,list); _ls_q = _NEXT(_ls_q,list,next); _RS(list); \
if (!_ls_q) break; \
} \
_ls_qsize = _ls_insize; \
while ((_ls_psize > 0) || ((_ls_qsize > 0) && _ls_q)) { \
if (_ls_psize == 0) { \
_ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \
_NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \
} else if ((_ls_qsize == 0) || (!_ls_q)) { \
_ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \
_NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \
} else if (cmp(_ls_p,_ls_q) <= 0) { \
_ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \
_NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \
} else { \
_ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \
_NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \
} \
if (_ls_tail) { \
_SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_ls_e,next); _RS(list); \
} else { \
_CASTASGN(list,_ls_e); \
} \
_SV(_ls_e,list); _PREVASGN(_ls_e,list,_ls_tail,prev); _RS(list); \
_ls_tail = _ls_e; \
} \
_ls_p = _ls_q; \
} \
_CASTASGN((list)->prev, _ls_tail); \
_SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,NULL,next); _RS(list); \
if (_ls_nmerges <= 1) { \
_ls_looping=0; \
} \
_ls_insize *= 2; \
} \
} \
} while (0)
/* CDL_SORT(list,cmp): sort a circular doubly-linked list in place using the
 * fields named "prev" and "next". cmp(a,b) receives two element pointers;
 * when it returns <= 0 the element from the earlier run (a) is emitted
 * first. */
#define CDL_SORT(list, cmp) \
CDL_SORT2(list, cmp, prev, next)
/* CDL_SORT2(list,cmp,prev,next): as CDL_SORT with the link field names given.
 * Same run-doubling merge as the LL/DL variants. Because the list is
 * circular, the head at the start of each pass is remembered in _ls_oldhead
 * and reaching it again is treated as the end of the list. After every pass
 * the ring is closed again: head->prev is the last element and the last
 * element's next is the head. */
#define CDL_SORT2(list, cmp, prev, next) \
do { \
LDECLTYPE(list) _ls_p; \
LDECLTYPE(list) _ls_q; \
LDECLTYPE(list) _ls_e; \
LDECLTYPE(list) _ls_tail; \
LDECLTYPE(list) _ls_oldhead; \
LDECLTYPE(list) _tmp; \
int _ls_insize, _ls_nmerges, _ls_psize, _ls_qsize, _ls_i, _ls_looping; \
if (list) { \
_ls_insize = 1; \
_ls_looping = 1; \
while (_ls_looping) { \
_CASTASGN(_ls_p,list); \
_CASTASGN(_ls_oldhead,list); \
(list) = NULL; \
_ls_tail = NULL; \
_ls_nmerges = 0; \
while (_ls_p) { \
_ls_nmerges++; \
_ls_q = _ls_p; \
_ls_psize = 0; \
for (_ls_i = 0; _ls_i < _ls_insize; _ls_i++) { \
_ls_psize++; \
_SV(_ls_q,list); \
if (_NEXT(_ls_q,list,next) == _ls_oldhead) { \
_ls_q = NULL; \
} else { \
_ls_q = _NEXT(_ls_q,list,next); \
} \
_RS(list); \
if (!_ls_q) break; \
} \
_ls_qsize = _ls_insize; \
while (_ls_psize > 0 || (_ls_qsize > 0 && _ls_q)) { \
if (_ls_psize == 0) { \
_ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \
_NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \
if (_ls_q == _ls_oldhead) { _ls_q = NULL; } \
} else if (_ls_qsize == 0 || !_ls_q) { \
_ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \
_NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \
if (_ls_p == _ls_oldhead) { _ls_p = NULL; } \
} else if (cmp(_ls_p,_ls_q) <= 0) { \
_ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \
_NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \
if (_ls_p == _ls_oldhead) { _ls_p = NULL; } \
} else { \
_ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \
_NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \
if (_ls_q == _ls_oldhead) { _ls_q = NULL; } \
} \
if (_ls_tail) { \
_SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_ls_e,next); _RS(list); \
} else { \
_CASTASGN(list,_ls_e); \
} \
_SV(_ls_e,list); _PREVASGN(_ls_e,list,_ls_tail,prev); _RS(list); \
_ls_tail = _ls_e; \
} \
_ls_p = _ls_q; \
} \
_CASTASGN((list)->prev,_ls_tail); \
_CASTASGN(_tmp,list); \
_SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_tmp,next); _RS(list); \
if (_ls_nmerges <= 1) { \
_ls_looping=0; \
} \
_ls_insize *= 2; \
} \
} \
} while (0)
/******************************************************************************
* singly linked list macros (non-circular) *
*****************************************************************************/
/* LL_PREPEND(head,add): push add onto the front of a singly-linked list that
 * uses the field named "next"; head is updated to point at add. */
#define LL_PREPEND(head,add) \
LL_PREPEND2(head,add,next)
/* LL_PREPEND2: as LL_PREPEND with the link field name supplied. */
#define LL_PREPEND2(head,add,next) \
do { \
(add)->next = (head); \
(head) = (add); \
} while (0)
/* LL_CONCAT(head1,head2): attach list head2 after the last element of list
 * head1. When head1 is empty it simply becomes head2. */
#define LL_CONCAT(head1,head2) \
LL_CONCAT2(head1,head2,next)
/* LL_CONCAT2: as LL_CONCAT with the link field name supplied. Walks head1 to
 * its last element, so the cost grows with the length of head1. */
#define LL_CONCAT2(head1,head2,next) \
do { \
LDECLTYPE(head1) _tmp; \
if (head1) { \
_tmp = (head1); \
while (_tmp->next) { _tmp = _tmp->next; } \
_tmp->next=(head2); \
} else { \
(head1)=(head2); \
} \
} while (0)
/* LL_APPEND(head,add): add element add at the end of a singly-linked list.
 * add->next is set to NULL; an empty list becomes just add. */
#define LL_APPEND(head,add) \
LL_APPEND2(head,add,next)
/* LL_APPEND2: as LL_APPEND with the link field name supplied. Walks the list
 * to find the last element. */
#define LL_APPEND2(head,add,next) \
do { \
LDECLTYPE(head) _tmp; \
(add)->next=NULL; \
if (head) { \
_tmp = (head); \
while (_tmp->next) { _tmp = _tmp->next; } \
_tmp->next=(add); \
} else { \
(head)=(add); \
} \
} while (0)
/* LL_DELETE(head,del): unlink element del from a singly-linked list. The
 * element itself is not freed and its next pointer is left unchanged. */
#define LL_DELETE(head,del) \
LL_DELETE2(head,del,next)
/* LL_DELETE2: as LL_DELETE with the link field name supplied.
 * If del is the head, the head advances. Otherwise the list is walked to find
 * del's predecessor; when del is not in the list nothing is changed.
 * The non-head path dereferences head, so head must not be NULL there. */
#define LL_DELETE2(head,del,next) \
do { \
LDECLTYPE(head) _tmp; \
if ((head) == (del)) { \
(head)=(head)->next; \
} else { \
_tmp = (head); \
while (_tmp->next && (_tmp->next != (del))) { \
_tmp = _tmp->next; \
} \
if (_tmp->next) { \
_tmp->next = (del)->next; \
} \
} \
} while (0)
#define LL_COUNT(head,el,counter) \
LL_COUNT2(head,el,counter,next) \
#define LL_COUNT2(head,el,counter,next) \
do { \
(counter) = 0; \
LL_FOREACH2(head,el,next) { ++(counter); } \
} while (0)
/* LL_FOREACH(head,el): loop header that visits every element of a
 * singly-linked list in order, with el as the cursor. Do not unlink el inside
 * the body; use LL_FOREACH_SAFE for that. */
#define LL_FOREACH(head,el) \
LL_FOREACH2(head,el,next)
/* LL_FOREACH2: as LL_FOREACH with the link field name supplied. */
#define LL_FOREACH2(head,el,next) \
for ((el) = (head); el; (el) = (el)->next)
/* LL_FOREACH_SAFE(head,el,tmp): like LL_FOREACH, but el's successor is read
 * into tmp before the body runs, so the body may unlink or free el. */
#define LL_FOREACH_SAFE(head,el,tmp) \
LL_FOREACH_SAFE2(head,el,tmp,next)
/* LL_FOREACH_SAFE2: as LL_FOREACH_SAFE with the link field name supplied. */
#define LL_FOREACH_SAFE2(head,el,tmp,next) \
for ((el) = (head); (el) && ((tmp) = (el)->next, 1); (el) = (tmp))
/* LL_SEARCH_SCALAR(head,out,field,val): set out to the first element whose
 * member `field` compares == to val, or to NULL when there is none. */
#define LL_SEARCH_SCALAR(head,out,field,val) \
LL_SEARCH_SCALAR2(head,out,field,val,next)
/* LL_SEARCH_SCALAR2: as LL_SEARCH_SCALAR with the link field name supplied. */
#define LL_SEARCH_SCALAR2(head,out,field,val,next) \
do { \
LL_FOREACH2(head,out,next) { \
if ((out)->field == (val)) break; \
} \
} while (0)
/* LL_SEARCH(head,out,elt,cmp): set out to the first element for which
 * cmp(out,elt) returns 0, or to NULL when there is none. */
#define LL_SEARCH(head,out,elt,cmp) \
LL_SEARCH2(head,out,elt,cmp,next)
/* LL_SEARCH2: as LL_SEARCH with the link field name supplied. */
#define LL_SEARCH2(head,out,elt,cmp,next) \
do { \
LL_FOREACH2(head,out,next) { \
if ((cmp(out,elt))==0) break; \
} \
} while (0)
#define LL_REPLACE_ELEM2(head, el, add, next) \
do { \
LDECLTYPE(head) _tmp; \
assert((head) != NULL); \
assert((el) != NULL); \
assert((add) != NULL); \
(add)->next = (el)->next; \
if ((head) == (el)) { \
(head) = (add); \
} else { \
_tmp = (head); \
while (_tmp->next && (_tmp->next != (el))) { \
_tmp = _tmp->next; \
} \
if (_tmp->next) { \
_tmp->next = (add); \
} \
} \
} while (0)
#define LL_REPLACE_ELEM(head, el, add) \
LL_REPLACE_ELEM2(head, el, add, next)
#define LL_PREPEND_ELEM2(head, el, add, next) \
do { \
if (el) { \
LDECLTYPE(head) _tmp; \
assert((head) != NULL); \
assert((add) != NULL); \
(add)->next = (el); \
if ((head) == (el)) { \
(head) = (add); \
} else { \
_tmp = (head); \
while (_tmp->next && (_tmp->next != (el))) { \
_tmp = _tmp->next; \
} \
if (_tmp->next) { \
_tmp->next = (add); \
} \
} \
} else { \
LL_APPEND2(head, add, next); \
} \
} while (0) \
#define LL_PREPEND_ELEM(head, el, add) \
LL_PREPEND_ELEM2(head, el, add, next)
#define LL_APPEND_ELEM2(head, el, add, next) \
do { \
if (el) { \
assert((head) != NULL); \
assert((add) != NULL); \
(add)->next = (el)->next; \
(el)->next = (add); \
} else { \
LL_PREPEND2(head, add, next); \
} \
} while (0) \
#define LL_APPEND_ELEM(head, el, add) \
LL_APPEND_ELEM2(head, el, add, next)
#ifdef NO_DECLTYPE
/* Here are VS2008 / NO_DECLTYPE replacements for a few functions */
/* NO_DECLTYPE variant of LL_CONCAT2: no typed temporary can be declared, so
 * head1 itself is advanced to the last element and then restored from the
 * char* copy kept in _tmp (via _RS). */
#undef LL_CONCAT2
#define LL_CONCAT2(head1,head2,next) \
do { \
char *_tmp; \
if (head1) { \
_tmp = (char*)(head1); \
while ((head1)->next) { (head1) = (head1)->next; } \
(head1)->next = (head2); \
_RS(head1); \
} else { \
(head1)=(head2); \
} \
} while (0)
/* NO_DECLTYPE variant of LL_APPEND2: add->next doubles as the walking cursor
 * while the last element is located, and is reset to NULL at the end. */
#undef LL_APPEND2
#define LL_APPEND2(head,add,next) \
do { \
if (head) { \
(add)->next = head; /* use add->next as a temp variable */ \
while ((add)->next->next) { (add)->next = (add)->next->next; } \
(add)->next->next=(add); \
} else { \
(head)=(add); \
} \
(add)->next=NULL; \
} while (0)
/* NO_DECLTYPE variant of LL_DELETE2: head itself is advanced to del's
 * predecessor and afterwards restored from the char* copy in _tmp (via _RS).
 * When del is not found the list is left unchanged. */
#undef LL_DELETE2
#define LL_DELETE2(head,del,next) \
do { \
if ((head) == (del)) { \
(head)=(head)->next; \
} else { \
char *_tmp = (char*)(head); \
while ((head)->next && ((head)->next != (del))) { \
(head) = (head)->next; \
} \
if ((head)->next) { \
(head)->next = ((del)->next); \
} \
_RS(head); \
} \
} while (0)
/* NO_DECLTYPE variant of LL_REPLACE_ELEM2: add->next doubles as the walking
 * cursor while el's predecessor is located; it receives its final value
 * (el's successor) on the last line. */
#undef LL_REPLACE_ELEM2
#define LL_REPLACE_ELEM2(head, el, add, next) \
do { \
assert((head) != NULL); \
assert((el) != NULL); \
assert((add) != NULL); \
if ((head) == (el)) { \
(head) = (add); \
} else { \
(add)->next = head; \
while ((add)->next->next && ((add)->next->next != (el))) { \
(add)->next = (add)->next->next; \
} \
if ((add)->next->next) { \
(add)->next->next = (add); \
} \
} \
(add)->next = (el)->next; \
} while (0)
#undef LL_PREPEND_ELEM2
#define LL_PREPEND_ELEM2(head, el, add, next) \
do { \
if (el) { \
assert((head) != NULL); \
assert((add) != NULL); \
if ((head) == (el)) { \
(head) = (add); \
} else { \
(add)->next = (head); \
while ((add)->next->next && ((add)->next->next != (el))) { \
(add)->next = (add)->next->next; \
} \
if ((add)->next->next) { \
(add)->next->next = (add); \
} \
} \
(add)->next = (el); \
} else { \
LL_APPEND2(head, add, next); \
} \
} while (0) \
#endif /* NO_DECLTYPE */
/******************************************************************************
* doubly linked list macros (non-circular) *
*****************************************************************************/
/* DL_PREPEND(head,add): push add onto the front of a doubly-linked list using
 * the fields named "prev" and "next". */
#define DL_PREPEND(head,add) \
DL_PREPEND2(head,add,prev,next)
/* DL_PREPEND2: as DL_PREPEND with the link field names supplied.
 * add inherits the old head's prev pointer, and in a one-element list
 * add->prev refers to add itself. */
#define DL_PREPEND2(head,add,prev,next) \
do { \
(add)->next = (head); \
if (head) { \
(add)->prev = (head)->prev; \
(head)->prev = (add); \
} else { \
(add)->prev = (add); \
} \
(head) = (add); \
} while (0)
/* DL_APPEND(head,add): add element add at the end of a doubly-linked list
 * using the fields named "prev" and "next". */
#define DL_APPEND(head,add) \
DL_APPEND2(head,add,prev,next)
/* DL_APPEND2: as DL_APPEND with the link field names supplied.
 * The current tail is reached through head->prev, so no walk is needed.
 * Afterwards head->prev is add and add->next is NULL. */
#define DL_APPEND2(head,add,prev,next) \
do { \
if (head) { \
(add)->prev = (head)->prev; \
(head)->prev->next = (add); \
(head)->prev = (add); \
(add)->next = NULL; \
} else { \
(head)=(add); \
(head)->prev = (head); \
(head)->next = NULL; \
} \
} while (0)
/* DL_CONCAT(head1,head2): attach list head2 after the last element of list
 * head1. Nothing happens when head2 is empty; when head1 is empty it becomes
 * head2. */
#define DL_CONCAT(head1,head2) \
DL_CONCAT2(head1,head2,prev,next)
/* DL_CONCAT2: as DL_CONCAT with the link field names supplied.
 * head2's tail (head2->prev) is saved in _tmp first because head2->prev is
 * overwritten with head1's old tail; head1->prev then becomes that saved
 * tail. */
#define DL_CONCAT2(head1,head2,prev,next) \
do { \
LDECLTYPE(head1) _tmp; \
if (head2) { \
if (head1) { \
_CASTASGN(_tmp, (head2)->prev); \
(head2)->prev = (head1)->prev; \
(head1)->prev->next = (head2); \
_CASTASGN((head1)->prev, _tmp); \
} else { \
(head1)=(head2); \
} \
} \
} while (0)
/* DL_DELETE(head,del): unlink element del from a doubly-linked list. The
 * element is not freed and its own prev/next fields are left unchanged. */
#define DL_DELETE(head,del) \
DL_DELETE2(head,del,prev,next)
/* DL_DELETE2: as DL_DELETE with the link field names supplied.
 * Cases, in order: del is the only element (its prev is itself), so the list
 * becomes empty; del is the head, so the head advances and inherits the tail
 * pointer; otherwise del is bypassed, and when del was the tail head->prev is
 * moved to del's predecessor. */
#define DL_DELETE2(head,del,prev,next) \
do { \
assert((del)->prev != NULL); \
if ((del)->prev == (del)) { \
(head)=NULL; \
} else if ((del)==(head)) { \
(del)->next->prev = (del)->prev; \
(head) = (del)->next; \
} else { \
(del)->prev->next = (del)->next; \
if ((del)->next) { \
(del)->next->prev = (del)->prev; \
} else { \
(head)->prev = (del)->prev; \
} \
} \
} while (0)
#define DL_COUNT(head,el,counter) \
DL_COUNT2(head,el,counter,next) \
#define DL_COUNT2(head,el,counter,next) \
do { \
(counter) = 0; \
DL_FOREACH2(head,el,next) { ++(counter); } \
} while (0)
#define DL_FOREACH(head,el) \
DL_FOREACH2(head,el,next)
#define DL_FOREACH2(head,el,next) \
for ((el) = (head); el; (el) = (el)->next)
/* this version is safe for deleting the elements during iteration */
#define DL_FOREACH_SAFE(head,el,tmp) \
DL_FOREACH_SAFE2(head,el,tmp,next)
#define DL_FOREACH_SAFE2(head,el,tmp,next) \
for ((el) = (head); (el) && ((tmp) = (el)->next, 1); (el) = (tmp))
/* these are identical to their singly-linked list counterparts */
#define DL_SEARCH_SCALAR LL_SEARCH_SCALAR
#define DL_SEARCH LL_SEARCH
#define DL_SEARCH_SCALAR2 LL_SEARCH_SCALAR2
#define DL_SEARCH2 LL_SEARCH2
#define DL_REPLACE_ELEM2(head, el, add, prev, next) \
do { \
assert((head) != NULL); \
assert((el) != NULL); \
assert((add) != NULL); \
if ((head) == (el)) { \
(head) = (add); \
(add)->next = (el)->next; \
if ((el)->next == NULL) { \
(add)->prev = (add); \
} else { \
(add)->prev = (el)->prev; \
(add)->next->prev = (add); \
} \
} else { \
(add)->next = (el)->next; \
(add)->prev = (el)->prev; \
(add)->prev->next = (add); \
if ((el)->next == NULL) { \
(head)->prev = (add); \
} else { \
(add)->next->prev = (add); \
} \
} \
} while (0)
#define DL_REPLACE_ELEM(head, el, add) \
DL_REPLACE_ELEM2(head, el, add, prev, next)
#define DL_PREPEND_ELEM2(head, el, add, prev, next) \
do { \
if (el) { \
assert((head) != NULL); \
assert((add) != NULL); \
(add)->next = (el); \
(add)->prev = (el)->prev; \
(el)->prev = (add); \
if ((head) == (el)) { \
(head) = (add); \
} else { \
(add)->prev->next = (add); \
} \
} else { \
DL_APPEND2(head, add, prev, next); \
} \
} while (0) \
#define DL_PREPEND_ELEM(head, el, add) \
DL_PREPEND_ELEM2(head, el, add, prev, next)
#define DL_APPEND_ELEM2(head, el, add, prev, next) \
do { \
if (el) { \
assert((head) != NULL); \
assert((add) != NULL); \
(add)->next = (el)->next; \
(add)->prev = (el); \
(el)->next = (add); \
if ((add)->next) { \
(add)->next->prev = (add); \
} else { \
(head)->prev = (add); \
} \
} else { \
DL_PREPEND2(head, add, prev, next); \
} \
} while (0) \
#define DL_APPEND_ELEM(head, el, add) \
DL_APPEND_ELEM2(head, el, add, prev, next)
/******************************************************************************
* circular doubly linked list macros *
*****************************************************************************/
/* CDL_APPEND(head,add): add element add at the end of a circular
 * doubly-linked list, i.e. just before the head; head itself is unchanged
 * unless the list was empty. */
#define CDL_APPEND(head,add) \
CDL_APPEND2(head,add,prev,next)
/* CDL_APPEND2: as CDL_APPEND with the link field names supplied. A lone
 * element has prev and next both referring to itself. */
#define CDL_APPEND2(head,add,prev,next) \
do { \
if (head) { \
(add)->prev = (head)->prev; \
(add)->next = (head); \
(head)->prev = (add); \
(add)->prev->next = (add); \
} else { \
(add)->prev = (add); \
(add)->next = (add); \
(head) = (add); \
} \
} while (0)
/* CDL_PREPEND(head,add): insert add before the current head of a circular
 * doubly-linked list and make add the new head. */
#define CDL_PREPEND(head,add) \
CDL_PREPEND2(head,add,prev,next)
/* CDL_PREPEND2: as CDL_PREPEND with the link field names supplied. The
 * linking is the same as in CDL_APPEND2; the difference is that head is
 * always moved to add. */
#define CDL_PREPEND2(head,add,prev,next) \
do { \
if (head) { \
(add)->prev = (head)->prev; \
(add)->next = (head); \
(head)->prev = (add); \
(add)->prev->next = (add); \
} else { \
(add)->prev = (add); \
(add)->next = (add); \
} \
(head) = (add); \
} while (0)
/* CDL_DELETE(head,del): unlink element del from a circular doubly-linked
 * list. The element is not freed and its own links are left unchanged. */
#define CDL_DELETE(head,del) \
CDL_DELETE2(head,del,prev,next)
/* CDL_DELETE2: as CDL_DELETE with the link field names supplied. Removing
 * the only element empties the list; removing the head moves the head to its
 * successor. */
#define CDL_DELETE2(head,del,prev,next) \
do { \
if (((head)==(del)) && ((head)->next == (head))) { \
(head) = NULL; \
} else { \
(del)->next->prev = (del)->prev; \
(del)->prev->next = (del)->next; \
if ((del) == (head)) (head)=(del)->next; \
} \
} while (0)
#define CDL_COUNT(head,el,counter) \
CDL_COUNT2(head,el,counter,next) \
#define CDL_COUNT2(head, el, counter,next) \
do { \
(counter) = 0; \
CDL_FOREACH2(head,el,next) { ++(counter); } \
} while (0)
#define CDL_FOREACH(head,el) \
CDL_FOREACH2(head,el,next)
#define CDL_FOREACH2(head,el,next) \
for ((el)=(head);el;(el)=(((el)->next==(head)) ? NULL : (el)->next))
#define CDL_FOREACH_SAFE(head,el,tmp1,tmp2) \
CDL_FOREACH_SAFE2(head,el,tmp1,tmp2,prev,next)
#define CDL_FOREACH_SAFE2(head,el,tmp1,tmp2,prev,next) \
for ((el) = (head), (tmp1) = (head) ? (head)->prev : NULL; \
(el) && ((tmp2) = (el)->next, 1); \
(el) = ((el) == (tmp1) ? NULL : (tmp2)))
#define CDL_SEARCH_SCALAR(head,out,field,val) \
CDL_SEARCH_SCALAR2(head,out,field,val,next)
#define CDL_SEARCH_SCALAR2(head,out,field,val,next) \
do { \
CDL_FOREACH2(head,out,next) { \
if ((out)->field == (val)) break; \
} \
} while (0)
#define CDL_SEARCH(head,out,elt,cmp) \
CDL_SEARCH2(head,out,elt,cmp,next)
#define CDL_SEARCH2(head,out,elt,cmp,next) \
do { \
CDL_FOREACH2(head,out,next) { \
if ((cmp(out,elt))==0) break; \
} \
} while (0)
#define CDL_REPLACE_ELEM2(head, el, add, prev, next) \
do { \
assert((head) != NULL); \
assert((el) != NULL); \
assert((add) != NULL); \
if ((el)->next == (el)) { \
(add)->next = (add); \
(add)->prev = (add); \
(head) = (add); \
} else { \
(add)->next = (el)->next; \
(add)->prev = (el)->prev; \
(add)->next->prev = (add); \
(add)->prev->next = (add); \
if ((head) == (el)) { \
(head) = (add); \
} \
} \
} while (0)
#define CDL_REPLACE_ELEM(head, el, add) \
CDL_REPLACE_ELEM2(head, el, add, prev, next)
#define CDL_PREPEND_ELEM2(head, el, add, prev, next) \
do { \
if (el) { \
assert((head) != NULL); \
assert((add) != NULL); \
(add)->next = (el); \
(add)->prev = (el)->prev; \
(el)->prev = (add); \
(add)->prev->next = (add); \
if ((head) == (el)) { \
(head) = (add); \
} \
} else { \
CDL_APPEND2(head, add, prev, next); \
} \
} while (0)
#define CDL_PREPEND_ELEM(head, el, add) \
CDL_PREPEND_ELEM2(head, el, add, prev, next)
#define CDL_APPEND_ELEM2(head, el, add, prev, next) \
do { \
if (el) { \
assert((head) != NULL); \
assert((add) != NULL); \
(add)->next = (el)->next; \
(add)->prev = (el); \
(el)->next = (add); \
(add)->next->prev = (add); \
} else { \
CDL_PREPEND2(head, add, prev, next); \
} \
} while (0)
#define CDL_APPEND_ELEM(head, el, add) \
CDL_APPEND_ELEM2(head, el, add, prev, next)
#endif /* UTLIST_H */
+398
View File
@@ -0,0 +1,398 @@
/*
Copyright (c) 2008-2016, Troy D. Hanson http://troydhanson.github.com/uthash/
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* a dynamic string implementation using macros
*/
#ifndef UTSTRING_H
#define UTSTRING_H
#define UTSTRING_VERSION 2.0.1
#ifdef __GNUC__
#define _UNUSED_ __attribute__ ((__unused__))
#else
#define _UNUSED_
#endif
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <stdarg.h>
/* Out-of-memory hook used by the allocation macros below. Define oom before
 * including this header to override the default, which terminates the
 * process with exit(-1). */
#ifndef oom
#define oom() exit(-1)
#endif
/* Growable string buffer: d is the storage, n its allocated size and i the
 * number of bytes in use (the offset where the next write goes). */
typedef struct {
char *d;
size_t n; /* allocd size */
size_t i; /* index of first unused byte */
} UT_string;
/* utstring_reserve(s,amt): ensure at least amt unused bytes are available.
 * When the free space (n - i) is smaller than amt, the buffer is realloc'd
 * to n + amt bytes; oom() is invoked if realloc fails. */
#define utstring_reserve(s,amt) \
do { \
if (((s)->n - (s)->i) < (size_t)(amt)) { \
char *utstring_tmp = (char*)realloc( \
(s)->d, (s)->n + (amt)); \
if (utstring_tmp == NULL) oom(); \
(s)->d = utstring_tmp; \
(s)->n += (amt); \
} \
} while(0)
/* utstring_init(s): initialise an already-allocated UT_string. All fields
 * are reset, 100 bytes are reserved, and the buffer is set to the empty
 * string. Any buffer previously referenced by s->d is not freed. */
#define utstring_init(s) \
do { \
(s)->n = 0; (s)->i = 0; (s)->d = NULL; \
utstring_reserve(s,100); \
(s)->d[0] = '\0'; \
} while(0)
/* utstring_done(s): release the buffer of a UT_string that was set up with
 * utstring_init. Only n is reset; d still holds the freed address and i is
 * unchanged, so s must be re-initialised before it is used again. */
#define utstring_done(s) \
do { \
if ((s)->d != NULL) free((s)->d); \
(s)->n = 0; \
} while(0)
/* utstring_free(s): release both the buffer and the UT_string object itself;
 * the counterpart of utstring_new. */
#define utstring_free(s) \
do { \
utstring_done(s); \
free(s); \
} while(0)
/* utstring_new(s): allocate a UT_string with calloc, assign it to s and
 * initialise it; oom() is invoked if the allocation fails. */
#define utstring_new(s) \
do { \
s = (UT_string*)calloc(sizeof(UT_string),1); \
if (!s) oom(); \
utstring_init(s); \
} while(0)
/* utstring_renew(s): reuse-or-create helper. A non-NULL s is emptied with
 * utstring_clear; a NULL s is allocated with utstring_new. */
#define utstring_renew(s) \
do { \
if (s) { \
utstring_clear(s); \
} else { \
utstring_new(s); \
} \
} while(0)
/* utstring_clear(s): make the string empty while keeping its buffer. Writes
 * through s->d, so the buffer must already be allocated. */
#define utstring_clear(s) \
do { \
(s)->i = 0; \
(s)->d[0] = '\0'; \
} while(0)
/* utstring_bincpy(s,b,l): append l raw bytes from b to s. Room for l+1 bytes
 * is reserved so that a terminating '\0' can be written after the data; the
 * terminator is not counted in s->i. */
#define utstring_bincpy(s,b,l) \
do { \
utstring_reserve((s),(l)+1); \
if (l) memcpy(&(s)->d[(s)->i], b, l); \
(s)->i += (l); \
(s)->d[(s)->i]='\0'; \
} while(0)
/* utstring_concat(dst,src): append the contents of UT_string src to dst and
 * re-terminate dst; src is not modified. */
#define utstring_concat(dst,src) \
do { \
utstring_reserve((dst),((src)->i)+1); \
if ((src)->i) memcpy(&(dst)->d[(dst)->i], (src)->d, (src)->i); \
(dst)->i += (src)->i; \
(dst)->d[(dst)->i]='\0'; \
} while(0)
/* utstring_len(s): number of bytes in use, converted to unsigned. */
#define utstring_len(s) ((unsigned)((s)->i))
/* utstring_body(s): pointer to the underlying character buffer. */
#define utstring_body(s) ((s)->d)
/* Append printf-style formatted text to s, growing the buffer as needed.
 * s   - destination string; output is written at offset s->i.
 * fmt - printf format string.
 * ap  - arguments for fmt; a copy is used for each formatting attempt so the
 *       list can be replayed after the buffer has been enlarged. */
_UNUSED_ static void utstring_printf_va(UT_string *s, const char *fmt, va_list ap) {
int n;
va_list cp;
for (;;) {
#ifdef _WIN32
cp = ap;
#else
va_copy(cp, ap);
#endif
n = vsnprintf (&s->d[s->i], s->n-s->i, fmt, cp);
va_end(cp);
/* Success only when the output, excluding the terminator, fit in the
 * free space. */
if ((n > -1) && ((size_t) n < (s->n-s->i))) {
s->i += n;
return;
}
/* Else try again with more space. */
if (n > -1) utstring_reserve(s,n+1); /* exact */
else utstring_reserve(s,(s->n)*2); /* 2x */
}
}
#ifdef __GNUC__
/* support printf format checking (2=the format string, 3=start of varargs) */
static void utstring_printf(UT_string *s, const char *fmt, ...)
__attribute__ (( format( printf, 2, 3) ));
#endif
/* Append printf-style formatted text to s. Variadic front end that forwards
 * its arguments to utstring_printf_va. */
_UNUSED_ static void utstring_printf(UT_string *s, const char *fmt, ...) {
va_list ap;
va_start(ap,fmt);
utstring_printf_va(s,fmt,ap);
va_end(ap);
}
/*******************************************************************************
* begin substring search functions *
******************************************************************************/
/* Build KMP table from left to right. */
/* Fill P_KMP_Table with the fallback table used by _utstring_find
 * (left-to-right search).
 * P_Needle     - pattern bytes.
 * P_NeedleLen  - number of pattern bytes.
 * P_KMP_Table  - output; entries 0 .. P_NeedleLen are written, so it must
 *                have room for P_NeedleLen + 1 longs. Entry 0 is -1. */
_UNUSED_ static void _utstring_BuildTable(
    const char *P_Needle,
    size_t P_NeedleLen,
    long *P_KMP_Table)
{
    const long needle_len = (long) P_NeedleLen;
    long pos = 0;
    long cand = -1;

    P_KMP_Table[0] = -1;
    while (pos < needle_len)
    {
        /* Fall back until the candidate prefix can be extended. */
        while ( (cand > -1) && (P_Needle[pos] != P_Needle[cand]) )
        {
            cand = P_KMP_Table[cand];
        }
        pos++;
        cand++;
        /* When the next pattern byte equals the candidate's byte, a mismatch
         * there would fail again, so reuse the candidate's own fallback. */
        if ( (pos < needle_len) && (P_Needle[pos] == P_Needle[cand]) )
        {
            P_KMP_Table[pos] = P_KMP_Table[cand];
        }
        else
        {
            P_KMP_Table[pos] = cand;
        }
    }
}
/* Build KMP table from right to left. */
/* Fill P_KMP_Table with the fallback table used by _utstring_findR
 * (right-to-left search). This mirrors _utstring_BuildTable: the pattern is
 * processed from its last byte towards its first and the entry for pattern
 * index k is stored at P_KMP_Table[k + 1].
 * P_Needle     - pattern bytes.
 * P_NeedleLen  - number of pattern bytes.
 * P_KMP_Table  - output; entries 0 .. P_NeedleLen are written, so it must
 *                have room for P_NeedleLen + 1 longs. */
_UNUSED_ static void _utstring_BuildTableR(
    const char *P_Needle,
    size_t P_NeedleLen,
    long *P_KMP_Table)
{
    const long needle_len = (long) P_NeedleLen;
    long pos = P_NeedleLen - 1;
    long cand = pos + 1;

    P_KMP_Table[pos + 1] = cand;
    while (pos >= 0)
    {
        /* Fall back until the candidate suffix can be extended. */
        while ( (cand < needle_len) && (P_Needle[pos] != P_Needle[cand]) )
        {
            cand = P_KMP_Table[cand + 1];
        }
        pos--;
        cand--;
        if ( (pos >= 0) && (P_Needle[pos] == P_Needle[cand]) )
        {
            P_KMP_Table[pos + 1] = P_KMP_Table[cand + 1];
        }
        else
        {
            P_KMP_Table[pos + 1] = cand;
        }
    }
}
/* Search data from left to right. ( Multiple search mode. ) */
/* Search P_Haystack from left to right for P_Needle using a table prepared by
 * _utstring_BuildTable ("multiple search mode": the caller builds the table
 * once and may reuse it for several searches).
 * Returns the offset of the first match within P_Haystack, or -1 when there
 * is none.
 * The lengths are compared as long, matching the index variables; the
 * previous (int) casts truncated lengths that do not fit in an int. */
_UNUSED_ static long _utstring_find(
    const char *P_Haystack,
    size_t P_HaystackLen,
    const char *P_Needle,
    size_t P_NeedleLen,
    long *P_KMP_Table)
{
    const long V_HaystackLen = (long) P_HaystackLen;
    const long V_NeedleLen = (long) P_NeedleLen;
    long i, j;
    long V_FindPosition = -1;

    /* Search from left to right. i indexes the needle, j the haystack. The
     * second condition stops early once the remaining haystack is too short
     * to complete a match. */
    i = j = 0;
    while ( (j < V_HaystackLen) && (((V_HaystackLen - j) + i) >= V_NeedleLen) )
    {
        while ( (i > -1) && (P_Needle[i] != P_Haystack[j]) )
        {
            i = P_KMP_Table[i];
        }
        i++;
        j++;
        if (i >= V_NeedleLen)
        {
            /* Found. */
            V_FindPosition = j - i;
            break;
        }
    }

    return V_FindPosition;
}
/* Search data from right to left. ( Multiple search mode. ) */
/* Search P_Haystack from right to left for P_Needle using a table prepared by
 * _utstring_BuildTableR ("multiple search mode": the caller builds the table
 * once and may reuse it for several searches).
 * Returns the offset within P_Haystack where the right-most match starts, or
 * -1 when there is none.
 * The needle length is compared as long, matching the index variables; the
 * previous (int) cast truncated lengths that do not fit in an int. */
_UNUSED_ static long _utstring_findR(
    const char *P_Haystack,
    size_t P_HaystackLen,
    const char *P_Needle,
    size_t P_NeedleLen,
    long *P_KMP_Table)
{
    const long V_NeedleLen = (long) P_NeedleLen;
    long i, j;
    long V_FindPosition = -1;

    /* Search from right to left. i indexes the needle, j the haystack; both
     * start at the last byte. */
    j = (long) P_HaystackLen - 1;
    i = V_NeedleLen - 1;
    while ( (j >= 0) && (j >= i) )
    {
        while ( (i < V_NeedleLen) && (P_Needle[i] != P_Haystack[j]) )
        {
            i = P_KMP_Table[i + 1];
        }
        i--;
        j--;
        if (i < 0)
        {
            /* Found. */
            V_FindPosition = j + 1;
            break;
        }
    }

    return V_FindPosition;
}
/* Search data from left to right. ( One time search mode. ) */
/* Search s from left to right for P_Needle ("one time search mode": the
 * search table is built and freed inside the call).
 * s               - string to search.
 * P_StartPosition - offset at which the search starts. Negative values count
 *                   back from the end (-1 is the last byte); a negative value
 *                   reaching before the start of the string is clamped to 0
 *                   so that the search never reads in front of the buffer.
 * P_Needle        - pattern bytes.
 * P_NeedleLen     - number of pattern bytes; an empty needle is never found.
 * Returns the offset of the first match relative to the start of s, or -1
 * when there is no match, the start lies beyond the end of the string, or
 * the search table cannot be allocated. */
_UNUSED_ static long utstring_find(
    UT_string *s,
    long P_StartPosition, /* Start from 0. -1 means last position. */
    const char *P_Needle,
    size_t P_NeedleLen)
{
    long V_StartPosition;
    long V_HaystackLen;
    long *V_KMP_Table;
    long V_FindPosition = -1;

    if (P_StartPosition < 0)
    {
        V_StartPosition = (long) s->i + P_StartPosition;
        if (V_StartPosition < 0)
        {
            V_StartPosition = 0;
        }
    }
    else
    {
        V_StartPosition = P_StartPosition;
    }
    /* Negative when the start lies beyond the end; rejected just below. */
    V_HaystackLen = (long) s->i - V_StartPosition;
    if ( (V_HaystackLen >= (long) P_NeedleLen) && (P_NeedleLen > 0) )
    {
        V_KMP_Table = (long *)malloc(sizeof(long) * (P_NeedleLen + 1));
        if (V_KMP_Table != NULL)
        {
            _utstring_BuildTable(P_Needle, P_NeedleLen, V_KMP_Table);
            V_FindPosition = _utstring_find(s->d + V_StartPosition,
                                            V_HaystackLen,
                                            P_Needle,
                                            P_NeedleLen,
                                            V_KMP_Table);
            if (V_FindPosition >= 0)
            {
                /* Convert from window-relative to string-relative offset. */
                V_FindPosition += V_StartPosition;
            }
            free(V_KMP_Table);
        }
    }

    return V_FindPosition;
}
/* Search data from right to left. ( One time search mode. ) */
/* Search s from right to left for P_Needle ("one time search mode": the
 * search table is built and freed inside the call).
 * s               - string to search.
 * P_StartPosition - offset of the right-most byte that may take part in a
 *                   match. Negative values count back from the end (-1 is
 *                   the last byte). A position at or beyond the end of the
 *                   string is treated as the last byte so that the search
 *                   never looks past the stored content.
 * P_Needle        - pattern bytes.
 * P_NeedleLen     - number of pattern bytes; an empty needle is never found.
 * Returns the offset, relative to the start of s, where the right-most match
 * starts, or -1 when there is no match or the search table cannot be
 * allocated. */
_UNUSED_ static long utstring_findR(
    UT_string *s,
    long P_StartPosition, /* Start from 0. -1 means last position. */
    const char *P_Needle,
    size_t P_NeedleLen)
{
    long V_StartPosition;
    long V_HaystackLen;
    long *V_KMP_Table;
    long V_FindPosition = -1;

    if (P_StartPosition < 0)
    {
        V_StartPosition = (long) s->i + P_StartPosition;
    }
    else
    {
        V_StartPosition = P_StartPosition;
    }
    /* The haystack is the prefix s->d[0 .. V_StartPosition]; never let it
     * extend past the bytes actually stored in s. */
    V_HaystackLen = V_StartPosition + 1;
    if (V_HaystackLen > (long) s->i)
    {
        V_HaystackLen = (long) s->i;
    }
    if ( (V_HaystackLen >= (long) P_NeedleLen) && (P_NeedleLen > 0) )
    {
        V_KMP_Table = (long *)malloc(sizeof(long) * (P_NeedleLen + 1));
        if (V_KMP_Table != NULL)
        {
            _utstring_BuildTableR(P_Needle, P_NeedleLen, V_KMP_Table);
            V_FindPosition = _utstring_findR(s->d,
                                             V_HaystackLen,
                                             P_Needle,
                                             P_NeedleLen,
                                             V_KMP_Table);
            free(V_KMP_Table);
        }
    }

    return V_FindPosition;
}
/*******************************************************************************
* end substring search functions *
******************************************************************************/
#endif /* UTSTRING_H */
-27
View File
@@ -1,27 +0,0 @@
#include "computation_graph.h"
// Appends `operation` to the graph and returns the id assigned to it, which
// is the operation's index in operations_. When the creator is a real
// operation (neither NO_OPERATION nor ROOT_OPERATION), the new id is also
// recorded in that creator's spawned_operations_ entry.
OperationId ComputationGraph::add_operation(std::unique_ptr<Operation> operation) {
  OperationId operationid = operations_.size();
  const OperationId creator = operation->creator_operationid();
  RAY_CHECK_EQ(spawned_operations_.size(), operationid, "ComputationGraph is attempting to call add_operation, but spawned_operations_.size() != operationid.");
  operations_.push_back(std::move(operation));
  const bool creator_is_tracked = (creator != NO_OPERATION) && (creator != ROOT_OPERATION);
  if (creator_is_tracked) {
    spawned_operations_[creator].push_back(operationid);
  }
  // Every operation gets its own, initially empty, list of spawned children.
  spawned_operations_.emplace_back();
  return operationid;
}
// Looks up the task stored under `operationid`. The id must not be one of the
// sentinel values, must be in range, and must refer to an operation that
// carries a task (as opposed to a put); each condition is checked in turn.
const Task& ComputationGraph::get_task(OperationId operationid) {
  // Reject the two sentinel ids first so that they get a specific message.
  RAY_CHECK_NEQ(operationid, ROOT_OPERATION, "ComputationGraph attempting to get_task with operationid == ROOT_OPERATION");
  RAY_CHECK_NEQ(operationid, NO_OPERATION, "ComputationGraph attempting to get_task with operationid == NO_OPERATION");
  RAY_CHECK_LT(operationid, operations_.size(), "ComputationGraph attempting to get_task with operationid " << operationid << ", but operationid >= operations_.size().");

  const std::unique_ptr<Operation>& stored_operation = operations_[operationid];
  RAY_CHECK(stored_operation->has_task(), "Calling get_task with operationid " << operationid << ", but this corresponds to a put not a task.");
  return stored_operation->task();
}
// Copies every stored operation, in id order, into a new entry appended to
// `computation_graph`.
void ComputationGraph::to_protobuf(CompGraph* computation_graph) {
  for (const std::unique_ptr<Operation>& stored_operation : operations_) {
    computation_graph->add_operation()->CopyFrom(*stored_operation);
  }
}
-35
View File
@@ -1,35 +0,0 @@
#ifndef RAY_COMPUTATIONGRAPH_H
#define RAY_COMPUTATIONGRAPH_H
#include <iostream>
#include <limits>
#include "ray/ray.h"
#include "graph.pb.h"
#include "types.pb.h"
// Sentinel OperationIds. They occupy the two largest values representable by
// OperationId.
// used to represent the root operation (that is, the driver code)
const OperationId ROOT_OPERATION = std::numeric_limits<OperationId>::max();
// used to represent the absence of an operation
const OperationId NO_OPERATION = std::numeric_limits<OperationId>::max() - 1;
// Records the operations (tasks and puts) that have been added, together with
// which operations each one spawned. Operations are identified by their index
// in operations_.
class ComputationGraph {
public:
// Add an operation to the computation graph, this returns the OperationId for
// the new operation. This method takes ownership over operation.
OperationId add_operation(std::unique_ptr<Operation> operation);
// Return the task corresponding to a particular OperationId. If operationid
// corresponds to a put, then fail.
const Task& get_task(OperationId operationid);
// Serialize the computation graph to ProtoBuf and store it in computation_graph
void to_protobuf(CompGraph* computation_graph);
private:
// maps an OperationId to the corresponding task or put
std::vector<std::unique_ptr<Operation> > operations_;
// spawned_operations_[operationid] is a vector of the OperationIds of the
// operations spawned by the task with OperationId operationid
// (kept the same length as operations_ by add_operation)
std::vector<std::vector<OperationId> > spawned_operations_;
};
#endif
-202
View File
@@ -1,202 +0,0 @@
#include "ipc.h"
#if defined(__unix__) || defined(__linux__)
#include <sys/statvfs.h>
#endif
#include <stdlib.h>
#include "ray/ray.h"
#include "utils.h"
// Builds a handle from its four components. Members are initialized in the
// order they are declared in the class (segmentid_, ipcpointer_, size_,
// metadata_offset_); listing them in that order avoids -Wreorder warnings.
ObjHandle::ObjHandle(SegmentId segmentid, size_t size, IpcPointer ipcpointer, size_t metadata_offset)
  : segmentid_(segmentid), ipcpointer_(ipcpointer), size_(size), metadata_offset_(metadata_offset)
{}
// Creates an unconnected queue object; create_ stays false until connect()
// successfully creates a queue.
MessageQueue<>::MessageQueue() : create_(false) { }
// Removes the named queue from the system, but only when this object created
// it and still owns a name (a moved-from object has an empty name).
MessageQueue<>::~MessageQueue() {
  const bool owns_queue = !name_.empty() && create_;
  if (!owns_queue) {
    return;
  }
  RAY_LOG(RAY_DEBUG, "Removing message queue " << name_.c_str() << ", create = " << create_);
  bip::message_queue::remove(name_.c_str());
}
// Move constructor: takes over the name, creation flag and queue handle of
// `other`, and leaves `other` with an empty name so that its destructor does
// not remove the queue.
MessageQueue<>::MessageQueue(MessageQueue&& other)
  : name_(std::move(other.name_)),
    create_(other.create_),
    queue_(std::move(other.queue_)) {
  // A moved-from std::string is not guaranteed to be empty, so clear it
  // explicitly. See: https://stackoverflow.com/a/17735913
  other.name_.clear();
}
// Move assignment: releases whatever queue this object currently owns, then
// takes over the name, creation flag and queue handle of `other`.
MessageQueue<>& MessageQueue<>::operator=(MessageQueue&& other) {
  // Self-move must be a no-op; otherwise the clear() below would wipe our own
  // name and the queue would never be removed.
  if (this == &other) {
    return *this;
  }
  // Mirror the destructor: a queue that we created and are about to drop has
  // to be removed, or its name would leak. name_ is tested first so that
  // create_ is not read on an object that has no name yet.
  if (!name_.empty() && create_) {
    RAY_LOG(RAY_DEBUG, "Removing message queue " << name_.c_str() << ", create = " << create_);
    bip::message_queue::remove(name_.c_str());
  }
  name_ = std::move(other.name_);
  create_ = other.create_;
  queue_ = std::move(other.queue_);
  other.name_.clear(); // It is unclear if this is guaranteed, but we need it to hold for the destructor. See: https://stackoverflow.com/a/17735913
  other.create_ = false; // Ownership of the queue has been transferred.
  return *this;
}
// Connects this object to the named interprocess queue.
//   name             - logical queue name; a fixed "ray-{GUID}-" prefix is
//                      prepended to form the system-wide name.
//   create           - true to (re)create the queue, false to open an
//                      existing one.
//   message_size     - maximum size of one message (used when creating).
//   message_capacity - maximum number of queued messages (used when creating).
// Returns true; a boost::interprocess failure is reported through RAY_CHECK.
bool MessageQueue<>::connect(const std::string& name, bool create, size_t message_size, size_t message_capacity) {
  name_ = "ray-{BC200A09-2465-431D-AEC7-2F8530B04535}-" + name;
#if defined(WIN32) || defined(_WIN32)
  std::replace(name_.begin(), name_.end(), ':', '-');
#endif
  try {
    if (!create) {
      queue_.reset(new bip::message_queue(bip::open_only, name_.c_str()));
    } else {
      // A previous run may have left the queue behind; remove it first.
      bip::message_queue::remove(name_.c_str());
      queue_.reset(new bip::message_queue(bip::create_only, name_.c_str(), message_capacity, message_size));
      // Only mark the queue as ours once creation has succeeded.
      create_ = true;
    }
  }
  catch (bip::interprocess_exception &ex) {
    RAY_CHECK(false, "name = " << name_ << ", create = " << create << ", boost::interprocess exception: " << ex.what());
  }
  return true;
}
// True once connect() has stored a queue handle in queue_.
bool MessageQueue<>::connected() {
  return static_cast<bool>(queue_);
}
// Tries to enqueue `size` bytes starting at `object` with priority 0, without
// blocking. Returns true if the message was sent and false if the queue was
// full. A boost::interprocess failure is reported through RAY_CHECK.
bool MessageQueue<>::send(const void * object, size_t size) {
  // Initialized so that the return value is defined even if the exception
  // path below is taken.
  bool succeeded = false;
  try {
    // This will return true if the message was successfully sent and false if
    // the message queue is full.
    succeeded = queue_->try_send(object, size, 0);
  }
  catch (bip::interprocess_exception &ex) {
    RAY_CHECK(false, "boost::interprocess exception: " << ex.what());
  }
  return succeeded;
}
// Receives the next message into the buffer at `object`, which can hold
// `size` bytes. The received size and priority are not used. Returns true; a
// boost::interprocess failure is reported through RAY_CHECK.
bool MessageQueue<>::receive(void * object, size_t size) {
  try {
    bip::message_queue::size_type bytes_received;
    unsigned int message_priority;
    queue_->receive(object, size, bytes_received, message_priority);
  }
  catch (bip::interprocess_exception &ex) {
    RAY_CHECK(false, "boost::interprocess exception: " << ex.what());
  }
  return true;
}
// Creates a pool for the object store `objstoreid`. `create` selects create
// mode (object store) or open mode (worker). The port part of
// `objstore_address`, as located by split_ip_address, is stored separately
// because it is used in segment names.
// The initializer list follows the member declaration order (create_mode_,
// objstoreid_, objstore_address_), which is the order the members are
// actually initialized in.
MemorySegmentPool::MemorySegmentPool(ObjStoreId objstoreid, std::string& objstore_address, bool create)
  : create_mode_(create), objstoreid_(objstoreid), objstore_address_(objstore_address) {
  std::string::iterator split_point = split_ip_address(objstore_address);
  objstore_port_.assign(split_point, objstore_address.end());
}
// Makes segment `segmentid` available in this process if it is not already.
// In create mode a new shared memory segment large enough for `size` bytes is
// created; in open mode the existing shared memory segment is mapped into the
// process (`size` is not used). Opening a segment that is already OPENED is a
// no-op; opening a CLOSED segment fails.
void MemorySegmentPool::open_segment(SegmentId segmentid, size_t size) {
  RAY_LOG(RAY_DEBUG, "Opening segmentid " << segmentid << " on object store " << objstoreid_ << " with port " << objstore_port_ << " with create_mode_ = " << create_mode_);
  RAY_CHECK(segmentid == segments_.size() || !create_mode_, "Object store " << objstoreid_ << " with port " << objstore_port_ << " is attempting to open segmentid " << segmentid << " on the object store, but segments_.size() = " << segments_.size());
  if (segmentid >= segments_.size()) { // resize and initialize segments_
    // size_t indices: the table size is a size_t, so an int would narrow it
    // and make the loop condition a signed/unsigned comparison.
    const size_t current_size = segments_.size();
    segments_.resize(segmentid + 1);
    for (size_t i = current_size; i < segments_.size(); ++i) {
      segments_[i].first = nullptr;
      segments_[i].second = SegmentStatusType::UNOPENED;
    }
  }
  if (segments_[segmentid].second == SegmentStatusType::OPENED) {
    return;
  }
  RAY_CHECK_NEQ(segments_[segmentid].second, SegmentStatusType::CLOSED, "Attempting to open segmentid " << segmentid << ", but segments_[segmentid].second == SegmentStatusType::CLOSED.");
  std::string segment_name = get_segment_name(segmentid);
  if (create_mode_) {
    assert(size > 0);
    bip::shared_memory_object::remove(segment_name.c_str()); // remove segment if it has not been properly removed from last run
    size_t new_size = (size / page_size_ + 2) * page_size_; // additional room for boost's bookkeeping
    segments_[segmentid] = std::make_pair(std::unique_ptr<bip::managed_shared_memory>(new bip::managed_shared_memory(bip::create_only, segment_name.c_str(), new_size)), SegmentStatusType::OPENED);
  } else {
    segments_[segmentid] = std::make_pair(std::unique_ptr<bip::managed_shared_memory>(new bip::managed_shared_memory(bip::open_only, segment_name.c_str())), SegmentStatusType::OPENED);
  }
}
// Drops this process's mapping of the segment and marks it UNOPENED so that
// it can be opened again later. The shared memory object is not removed.
void MemorySegmentPool::unmap_segment(SegmentId segmentid) {
  auto& entry = segments_[segmentid];
  entry.first.reset();
  entry.second = SegmentStatusType::UNOPENED;
}
// Removes the named shared memory object behind the segment, drops the local
// mapping and marks the segment CLOSED.
void MemorySegmentPool::close_segment(SegmentId segmentid) {
  RAY_LOG(RAY_DEBUG, "closing segmentid " << segmentid);
  bip::shared_memory_object::remove(get_segment_name(segmentid).c_str());
  auto& entry = segments_[segmentid];
  entry.first.reset();
  entry.second = SegmentStatusType::CLOSED;
}
// Allocates `size` bytes in a freshly created segment and returns a handle
// (segment id, size, interprocess handle) to the allocation. Only valid in
// create mode.
ObjHandle MemorySegmentPool::allocate(size_t size) {
  RAY_CHECK(create_mode_, "Attempting to call allocate, but create_mode_ is false");
  // TODO(pcm): at the moment, this always creates a new segment, this will be changed
  const SegmentId new_segmentid = segments_.size();
  open_segment(new_segmentid, size);
  objstore_memcheck(size);
  bip::managed_shared_memory& segment = *segments_[new_segmentid].first;
  void* const address = segment.allocate(size);
  return ObjHandle(new_segmentid, size, segment.get_handle_from_address(address));
}
// Frees the allocation referred to by `pointer` and then closes the segment
// it lived in.
void MemorySegmentPool::deallocate(ObjHandle pointer) {
  const SegmentId owning_segment = pointer.segmentid();
  bip::managed_shared_memory& segment = *segments_[owning_segment].first;
  segment.deallocate(segment.get_address_from_handle(pointer.ipcpointer()));
  close_segment(owning_segment);
}
// Translates a handle into an address that is valid in the calling process.
// In open mode the segment is opened on demand; in create mode the segment
// must already be OPENED.
uint8_t* MemorySegmentPool::get_address(ObjHandle pointer) {
  RAY_CHECK(!create_mode_ || segments_[pointer.segmentid()].second == SegmentStatusType::OPENED, "Object store " << objstoreid_ << " is attempting to call get_address on segmentid " << pointer.segmentid() << ", which has not been opened yet.");
  const SegmentId segmentid = pointer.segmentid();
  if (!create_mode_) {
    open_segment(segmentid);
  }
  void* const address = segments_[segmentid].first->get_address_from_handle(pointer.ipcpointer());
  return static_cast<uint8_t*>(address);
}
// Builds the system-wide name of a segment from a fixed prefix, the object
// store id, the object store port and the segment id.
std::string MemorySegmentPool::get_segment_name(SegmentId segmentid) {
  std::string segment_name("ray-{BC200A09-2465-431D-AEC7-2F8530B04535}-objstore-");
  segment_name += std::to_string(objstoreid_);
  segment_name += "-";
  segment_name += objstore_port_;
  segment_name += "-segment-";
  segment_name += std::to_string(segmentid);
  return segment_name;
}
// Unmaps and removes every segment known to this pool (see destroy_segments).
MemorySegmentPool::~MemorySegmentPool() {
destroy_segments();
}
void MemorySegmentPool::objstore_memcheck(int64_t size) {
#if defined(__unix__) || defined(__linux__)
struct statvfs buffer;
statvfs("/dev/shm/", &buffer);
if (size + 100 > buffer.f_bsize * buffer.f_bavail) {
MemorySegmentPool::destroy_segments();
RAY_LOG(RAY_FATAL, "Not enough memory for allocating object in objectstore.");
}
#endif
}
void MemorySegmentPool::destroy_segments() {
for (size_t segmentid = 0; segmentid < segments_.size(); ++segmentid) {
std::string segment_name = get_segment_name(segmentid);
segments_[segmentid].first.reset();
bip::shared_memory_object::remove(segment_name.c_str());
}
}
#if defined(WIN32) || defined(_WIN32)
// Windows-only definition of windows_intermodule_singleton<windows_bootstamp>::get().
// It returns the bytes of a temporary std::string("BOOTSTAMP") reinterpreted
// as a windows_bootstamp.
// NOTE(review): this reinterprets an unrelated object type and is flagged
// below as possibly undefined behavior; do not "tidy" it without a tested
// replacement on Windows.
namespace boost {
namespace interprocess {
namespace ipcdetail {
windows_bootstamp windows_intermodule_singleton<windows_bootstamp>::get() {
// HACK: Only do this for Windows as there seems to be no better workaround. Possibly undefined behavior!
return reinterpret_cast<windows_bootstamp const &>(std::string("BOOTSTAMP"));
}
}
}
}
#endif
-142
View File
@@ -1,142 +0,0 @@
#ifndef RAY_IPC_H
#define RAY_IPC_H
#include <iostream>
#include <limits>
#if defined(WIN32) || defined(_WIN32)
#include <boost/interprocess/detail/windows_intermodule_singleton.hpp>
// Windows only: declares an explicit specialization of
// windows_intermodule_singleton for windows_bootstamp, so that the project can
// supply its own get(). This must appear before the other boost::interprocess
// headers are included below.
namespace boost {
namespace interprocess {
namespace ipcdetail {
struct windows_bootstamp;
template<>
class windows_intermodule_singleton<windows_bootstamp> {
public:
static windows_bootstamp get();
};
}
}
}
#endif
#include <boost/interprocess/managed_shared_memory.hpp>
#include <boost/interprocess/ipc/message_queue.hpp>
#include "ray/ray.h"
namespace bip = boost::interprocess;
// Methods for inter process communication (abstracts from the shared memory implementation)
// Message Queues: Exchanging objects of type T between processes on a node
template<typename T = void>
class MessageQueue;

// Untyped base for all message queues. It owns the boost::interprocess queue
// and implements the byte-level operations; the typed MessageQueue<T> below
// exposes them with the message size fixed to sizeof(T).
// The class is movable. Because a move constructor is declared, the copy
// operations are implicitly deleted.
template<>
class MessageQueue<> {
public:
  ~MessageQueue();
  MessageQueue();
  MessageQueue(MessageQueue&& other);
  MessageQueue& operator=(MessageQueue&& other);
  // True once connect() has stored a queue handle.
  bool connected();
protected:
  // Create (create == true) or open (create == false) the queue called `name`.
  bool connect(const std::string& name, bool create, size_t message_size, size_t message_capacity);
  // Non-blocking send of `size` bytes; returns false if the queue is full.
  bool send(const void* object, size_t size);
  // Receive of one message into a buffer of `size` bytes.
  bool receive(void* object, size_t size);
private:
  std::string name_;  // system-wide queue name; empty after being moved from
  bool create_;       // true if this object created the queue (and must remove it)
  std::unique_ptr<bip::message_queue> queue_;
};
// Typed message queue: every message is exactly one object of type T, copied
// by value through the underlying untyped queue.
template<typename T>
class MessageQueue : public MessageQueue<> {
public:
  // Create or open the queue `name`, sized for `capacity` messages of sizeof(T).
  bool connect(const std::string& name, bool create, size_t capacity = 1000) {
    return MessageQueue<>::connect(name, create, sizeof(T), capacity);
  }
  // Send a copy of *object; returns false if the queue is full.
  bool send(const T* object) {
    return MessageQueue<>::send(object, sizeof(T));
  }
  // Receive the next message into *object.
  bool receive(T* object) {
    return MessageQueue<>::receive(object, sizeof(T));
  }
};
// Object Queues
// Requests exchanged between the object store and its workers. Each entry
// lists the fields that are used and the reply, if any:
//   ALLOC:       workerid, objectid, size -> objhandle
//   GET:         workerid, objectid -> objhandle
//   WORKER_DONE: workerid, objectid -> ()
//   ALIAS_DONE:  objectid -> ()
enum ObjRequestType {
  // worker requests an allocation from the object store
  ALLOC = 0,
  // worker requests an object from the object store
  GET = 1,
  // worker tells the object store that an object has been finalized
  WORKER_DONE = 2,
  // objstore tells itself that it has finalized something (perhaps an alias)
  ALIAS_DONE = 3
};
// One request message as sent over the object store's receive queue. Which
// fields are meaningful depends on `type`; see ObjRequestType.
struct ObjRequest {
WorkerId workerid; // worker that sends the request
ObjRequestType type; // do we want to allocate a new object or get a handle?
ObjectID objectid; // object ID of the object to be returned/allocated
int64_t size; // if allocate, that's the size of the object
int64_t metadata_offset; // if sending 'WORKER_DONE', that's the location of the metadata relative to the beginning of the object
};
typedef size_t SegmentId; // index into a memory segment table
// Handle to an address inside a managed shared memory segment, as produced by
// get_handle_from_address and consumed by get_address_from_handle.
typedef bip::managed_shared_memory::handle_t IpcPointer;
// Object handle: Handle to object that can be passed around between processes
// that are connected to the same object store.
// The accessors are const so that they can be called on const handles and
// through const references.
class ObjHandle {
public:
  ObjHandle(SegmentId segmentid = 0, size_t size = 0, IpcPointer ipcpointer = IpcPointer(), size_t metadata_offset = 0);
  SegmentId segmentid() const { return segmentid_; }
  size_t size() const { return size_; }
  IpcPointer ipcpointer() const { return ipcpointer_; }
  size_t metadata_offset() const { return metadata_offset_; }
  void set_metadata_offset(size_t metadata_offset) { metadata_offset_ = metadata_offset; }
private:
  SegmentId segmentid_; // which shared memory file the object is stored in
  IpcPointer ipcpointer_; // pointer to the beginning of the object, exchangeable between processes
  size_t size_; // total size of the object
  size_t metadata_offset_; // offset of the metadata that describes this object
};
// Memory segment pool: A collection of shared memory segments, used in two
// modes:
//   - on the object store it is used with create = true; in this case the
//     segments are allocated
//   - on the worker it is used in open mode, with create = false; in this
//     case the segments, which have been created by the object store, are
//     just mapped into memory
//
// State of one entry in the pool's segment table.
enum SegmentStatusType {
  UNOPENED = 0,
  OPENED = 1,
  CLOSED = 2
};
// Table of shared memory segments belonging to one object store. Segments are
// identified by their index (SegmentId) in the table.
class MemorySegmentPool {
public:
MemorySegmentPool(ObjStoreId objstoreid, std::string& objstore_address, bool create); // can be used in two modes: create mode and open mode (see above)
~MemorySegmentPool();
ObjHandle allocate(size_t nbytes); // allocate memory in a newly created segment (only run on object store)
void deallocate(ObjHandle pointer); // deallocate object and close the segment it was stored in (only run on object store)
uint8_t* get_address(ObjHandle pointer); // get address of shared object
std::string get_segment_name(SegmentId segmentid); // get the name of a segment
void unmap_segment(SegmentId segmentid); // unmap a memory segment from a client (only to be called by clients)
void destroy_segments(); // unmap and remove every segment in the table
void objstore_memcheck(int64_t size); // check that /dev/shm/ has room for an object of the given size
private:
void open_segment(SegmentId segmentid, size_t size = 0); // create a segment or map an existing one into memory
void close_segment(SegmentId segmentid); // close a segment
bool create_mode_; // true in the object stores, false on the workers
ObjStoreId objstoreid_; // the identity of the associated object store
// The address of the object store.
std::string objstore_address_;
// The port of the object store. This is used to help avoid name collisions.
std::string objstore_port_;
// Page size used to round segment sizes in open_segment.
size_t page_size_ = bip::mapped_region::get_page_size();
// segments_[segmentid] holds the mapping (if any) and the status of a segment.
std::vector<std::pair<std::unique_ptr<bip::managed_shared_memory>, SegmentStatusType> > segments_;
};
#endif
-375
View File
@@ -1,375 +0,0 @@
#include "objstore.h"
#include <chrono>
#include "utils.h"
// Maximum number of payload bytes StreamObjTo puts into one ObjChunk (8 KiB).
const size_t ObjStoreService::CHUNK_SIZE = 8 * 1024;
// Pulls the object `objectid` from the remote object store behind `stub` and
// stores it locally: the stream is read chunk by chunk into a newly allocated
// object, which is then marked ready.
// This method needs to be protected by objstores_lock_ (the caller in
// StartDelivery holds it).
// TODO(rkn): Make sure that we do not in fact need the objstore_lock_. We want multiple deliveries to be able to happen simultaneously.
void ObjStoreService::get_data_from(ObjectID objectid, ObjStore::Stub& stub) {
  RAY_LOG(RAY_DEBUG, "Objstore " << objstoreid_ << " is beginning to get objectid " << objectid);
  ObjChunk chunk;
  ClientContext context;
  StreamObjToRequest stream_request;
  stream_request.set_objectid(objectid);
  std::unique_ptr<ClientReader<ObjChunk> > reader(stub.StreamObjTo(&context, stream_request));
  // Every chunk carries the total size and the metadata offset, so the first
  // chunk is required before anything can be allocated. Without it there is
  // no valid handle to copy into.
  if (!reader->Read(&chunk)) {
    RAY_CHECK_GRPC(reader->Finish());
    RAY_CHECK(false, "Objstore " << objstoreid_ << " did not receive any data while streaming objectid " << objectid << ".");
    return;
  }
  const size_t total_size = chunk.total_size();
  // Remember the offset now: after the final, failed Read the contents of
  // `chunk` should no longer be relied upon.
  const size_t metadata_offset = chunk.metadata_offset();
  ObjHandle handle = alloc(objectid, total_size);
  size_t num_bytes = 0;
  segmentpool_lock_.lock();
  uint8_t* data = segmentpool_->get_address(handle);
  segmentpool_lock_.unlock();
  do {
    RAY_CHECK_LE(num_bytes + chunk.data().size(), total_size, "The reader attempted to stream too many bytes.");
    std::memcpy(data, chunk.data().c_str(), chunk.data().size());
    data += chunk.data().size();
    num_bytes += chunk.data().size();
  } while (reader->Read(&chunk));
  RAY_CHECK_GRPC(reader->Finish());
  // finalize object
  RAY_CHECK_EQ(num_bytes, total_size, "Streamed objectid " << objectid << ", but num_bytes != total_size");
  object_ready(objectid, metadata_offset);
  RAY_LOG(RAY_DEBUG, "finished streaming data, objectid was " << objectid << " and size was " << num_bytes);
}
// Creates the service and a stub for talking to the scheduler over
// `scheduler_channel`. The object store is not registered yet; see
// register_objstore.
ObjStoreService::ObjStoreService(std::shared_ptr<Channel> scheduler_channel)
: scheduler_stub_(Scheduler::NewStub(scheduler_channel)) {
}
// Sets this object store up for operation:
//   1. creates the receive queue `recv_queue_name`,
//   2. registers `objstore_address` with the scheduler and stores the
//      object store id from the reply,
//   3. creates the memory segment pool in create mode.
// The order matters: the segment pool needs the id assigned by the scheduler.
void ObjStoreService::register_objstore(const std::string& objstore_address, const std::string& recv_queue_name) {
// Create the queue that will be used by workers to send requests to the
// object store.
RAY_LOG(RAY_INFO, "Object store is creating queue with name " << recv_queue_name);
RAY_CHECK(recv_queue_.connect(recv_queue_name, true), "error connecting recv_queue_");
objstore_address_ = objstore_address;
// Register the object store with the scheduler.
ClientContext context;
RegisterObjStoreRequest request;
request.set_objstore_address(objstore_address);
RegisterObjStoreReply reply;
RAY_CHECK_GRPC(scheduler_stub_->RegisterObjStore(&context, request, &reply));
objstoreid_ = reply.objstoreid();
segmentpool_ = std::make_shared<MemorySegmentPool>(objstoreid_, objstore_address_, true);
}
// Returns the stub for the object store at `objstore_address`, creating and
// caching it (over an insecure channel) on first use.
// this method needs to be protected by a objstores_lock_
ObjStore::Stub& ObjStoreService::get_objstore_stub(const std::string& objstore_address) {
  auto found = objstores_.find(objstore_address);
  if (found == objstores_.end()) {
    auto channel = grpc::CreateChannel(objstore_address, grpc::InsecureChannelCredentials());
    found = objstores_.emplace(objstore_address, ObjStore::NewStub(channel)).first;
  }
  return *(found->second);
}
// RPC handler: starts fetching the requested object from the object store at
// request->objstore_address(), unless the object is already known locally.
// The transfer itself runs on a separate thread so that this call returns
// immediately.
Status ObjStoreService::StartDelivery(ServerContext* context, const StartDeliveryRequest* request, AckReply* reply) {
  // TODO(rkn): We're pushing the delivery task onto a new thread so that this method can return immediately. This matters
  // because the scheduler holds a lock while DeliverObj is being called. The correct solution is to make DeliverObj
  // an asynchronous call (and similarly with the rest of the object store service methods).
  const std::string address = request->objstore_address();
  const ObjectID objectid = request->objectid();
  {
    std::lock_guard<std::mutex> memory_lock(memory_lock_);
    if (objectid >= memory_.size()) {
      memory_.resize(objectid + 1, std::make_pair(ObjHandle(), MemoryStatusType::NOT_PRESENT));
    }
    MemoryStatusType& status = memory_[objectid].second;
    if (status != MemoryStatusType::NOT_PRESENT) {
      // Anything other than NOT_PRESENT means there is nothing to fetch.
      RAY_CHECK_NEQ(memory_[objectid].second, MemoryStatusType::DEALLOCATED, "Objstore " << objstoreid_ << " is attempting to get objectid " << objectid << ", but memory_[objectid] == DEALLOCATED.");
      RAY_LOG(RAY_DEBUG, "Objstore " << objstoreid_ << " already has objectid " << objectid << " or it is already being shipped, so no need to get it again.");
      return Status::OK;
    }
    // Reserve the slot so that a concurrent request does not start a second
    // delivery of the same object.
    status = MemoryStatusType::PRE_ALLOCED;
  }
  delivery_threads_.push_back(std::make_shared<std::thread>([this, address, objectid]() {
    std::lock_guard<std::mutex> objstores_lock(objstores_lock_);
    ObjStore::Stub& stub = get_objstore_stub(address);
    get_data_from(objectid, stub);
  }));
  return Status::OK;
}
// RPC handler: reports the ids of all objects whose status is READY.
Status ObjStoreService::ObjStoreInfo(ServerContext* context, const ObjStoreInfoRequest* request, ObjStoreInfoReply* reply) {
  std::lock_guard<std::mutex> memory_lock(memory_lock_);
  for (size_t i = 0; i < memory_.size(); ++i) {
    if (memory_[i].second != MemoryStatusType::READY) {
      continue; // object not available
    }
    reply->add_objectid(i);
  }
  /*
  for (int i = 0; i < request->objectid_size(); ++i) {
    ObjectID objectid = request->objectid(i);
    Obj* obj = new Obj();
    std::string data(memory_[objectid].ptr.data, memory_[objectid].ptr.len); // copies, but for debugging should be ok
    obj->ParseFromString(data);
    reply->mutable_obj()->AddAllocated(obj);
  }
  */
  return Status::OK;
}
// RPC handler: streams the READY object request->objectid() to the caller in
// chunks of at most CHUNK_SIZE bytes. Every chunk carries the total object
// size and the metadata offset.
// Both locks are taken through scoped guards, like elsewhere in this file, so
// they are released on every exit path from the guarded region.
Status ObjStoreService::StreamObjTo(ServerContext* context, const StreamObjToRequest* request, ServerWriter<ObjChunk>* writer) {
  RAY_LOG(RAY_DEBUG, "begin to stream data from object store " << objstoreid_);
  ObjectID objectid = request->objectid();
  ObjHandle handle;
  {
    std::lock_guard<std::mutex> memory_lock(memory_lock_);
    RAY_CHECK_LT(objectid, memory_.size(), "Objstore " << objstoreid_ << " is attempting to use objectid " << objectid << " in StreamObjTo, but this objectid is not present in the object store.");
    RAY_CHECK_EQ(memory_[objectid].second, MemoryStatusType::READY, "Objstore " << objstoreid_ << " is attempting to stream objectid " << objectid << ", but memory_[objectid].second != MemoryStatusType::READY.");
    handle = memory_[objectid].first;
  } // TODO(rkn): Make sure we don't still need to hold on to this lock.
  const uint8_t* head;
  {
    std::lock_guard<decltype(segmentpool_lock_)> segmentpool_lock(segmentpool_lock_);
    head = segmentpool_->get_address(handle);
  }
  const size_t size = handle.size();
  // The size and metadata offset are the same for every chunk, so set them
  // once; only the data field changes per iteration.
  ObjChunk chunk;
  chunk.set_metadata_offset(handle.metadata_offset());
  chunk.set_total_size(size);
  for (size_t i = 0; i < size; i += CHUNK_SIZE) {
    chunk.set_data(head + i, std::min(CHUNK_SIZE, size - i));
    RAY_CHECK(writer->Write(chunk), "stream connection prematurely closed");
  }
  return Status::OK;
}
// RPC handler: makes alias_objectid refer to the same stored object as
// canonical_objectid and marks it READY. Afterwards an ALIAS_DONE request is
// posted to the object store's own receive queue.
Status ObjStoreService::NotifyAlias(ServerContext* context, const NotifyAliasRequest* request, AckReply* reply) {
  // NotifyAlias assumes that the objstore already holds canonical_objectid
  ObjectID alias_objectid = request->alias_objectid();
  ObjectID canonical_objectid = request->canonical_objectid();
  RAY_LOG(RAY_DEBUG, "Aliasing objectid " << alias_objectid << " with objectid " << canonical_objectid);
  {
    std::lock_guard<std::mutex> memory_lock(memory_lock_);
    RAY_CHECK_LT(canonical_objectid, memory_.size(), "Attempting to alias objectid " << alias_objectid << " with objectid " << canonical_objectid << ", but objectid " << canonical_objectid << " is not in the objstore.");
    RAY_CHECK_NEQ(memory_[canonical_objectid].second, MemoryStatusType::NOT_READY, "Attempting to alias objectid " << alias_objectid << " with objectid " << canonical_objectid << ", but objectid " << canonical_objectid << " is not ready yet in the objstore.");
    RAY_CHECK_NEQ(memory_[canonical_objectid].second, MemoryStatusType::NOT_PRESENT, "Attempting to alias objectid " << alias_objectid << " with objectid " << canonical_objectid << ", but objectid " << canonical_objectid << " is not present in the objstore.");
    RAY_CHECK_NEQ(memory_[canonical_objectid].second, MemoryStatusType::DEALLOCATED, "Attempting to alias objectid " << alias_objectid << " with objectid " << canonical_objectid << ", but objectid " << canonical_objectid << " has already been deallocated.");
    if (alias_objectid >= memory_.size()) {
      memory_.resize(alias_objectid + 1, std::make_pair(ObjHandle(), MemoryStatusType::NOT_PRESENT));
    }
    memory_[alias_objectid].first = memory_[canonical_objectid].first;
    memory_[alias_objectid].second = MemoryStatusType::READY;
  }
  // Value-initialize the request: the whole struct is copied into the queue,
  // so the fields that are not set below must not be left indeterminate.
  ObjRequest done_request = ObjRequest();
  done_request.type = ObjRequestType::ALIAS_DONE;
  done_request.objectid = alias_objectid;
  RAY_CHECK(recv_queue_.send(&done_request), "Failed to send message from the object store to itself because the message queue was full.");
  return Status::OK;
}
// RPC handler: frees the storage of the READY object
// request->canonical_objectid() and marks it DEALLOCATED.
Status ObjStoreService::DeallocateObject(ServerContext* context, const DeallocateObjectRequest* request, AckReply* reply) {
  ObjectID canonical_objectid = request->canonical_objectid();
  RAY_LOG(RAY_INFO, "Deallocating canonical_objectid " << canonical_objectid);
  std::lock_guard<std::mutex> memory_lock(memory_lock_);
  // The range check has to come first: the status check below indexes
  // memory_ with canonical_objectid.
  RAY_CHECK_LT(canonical_objectid, memory_.size(), "Attempting to deallocate canonical_objectid " << canonical_objectid << ", but it is not in the objstore.");
  RAY_CHECK_EQ(memory_[canonical_objectid].second, MemoryStatusType::READY, "Attempting to deallocate canonical_objectid " << canonical_objectid << ", but memory_[canonical_objectid].second = " << memory_[canonical_objectid].second);
  {
    // Scoped guard so that the pool lock is released on every exit path.
    std::lock_guard<decltype(segmentpool_lock_)> segmentpool_lock(segmentpool_lock_);
    segmentpool_->deallocate(memory_[canonical_objectid].first);
  }
  memory_[canonical_objectid].second = MemoryStatusType::DEALLOCATED;
  return Status::OK;
}
// This table describes how the memory status changes in response to requests.
//
// MemoryStatus | ObjRequest | New MemoryStatus | action performed
// -------------+-------------+------------------+----------------------------
// NOT_PRESENT | ALLOC | NOT_READY | allocate object
// NOT_READY | WORKER_DONE | READY | send ObjReady to scheduler
// NOT_READY | GET | NOT_READY | add to get queue
// READY | GET | READY | return handle
// READY | DEALLOC | DEALLOCATED | deallocate
// -------------+-------------+------------------+----------------------------
// Handles a request that the object store sent to itself. The only such
// request is ALIAS_DONE, which answers the pending gets for the aliased
// object; any other type is reported as an error.
void ObjStoreService::process_objstore_request(const ObjRequest request) {
  if (request.type == ObjRequestType::ALIAS_DONE) {
    process_gets_for_objectid(request.objectid);
    return;
  }
  RAY_CHECK(false, "Attempting to process request of type " << request.type << ". This code should be unreachable.");
}
// Handles one request sent by a worker (ALLOC, GET or WORKER_DONE).
// Before dispatching, the reply queue for the worker is connected on first
// use and memory_ is grown so that request.objectid is a valid index.
void ObjStoreService::process_worker_request(const ObjRequest request) {
if (request.workerid >= send_queues_.size()) {
send_queues_.resize(request.workerid + 1);
}
if (!send_queues_[request.workerid].connected()) {
// The queue is opened, not created (create == false).
std::string queue_name = std::string("queue:") + objstore_address_ + std::string(":worker:") + std::to_string(request.workerid) + std::string(":obj");
RAY_CHECK(send_queues_[request.workerid].connect(queue_name, false), "error connecting receive_queue_");
}
{
// Released before the switch: alloc() and object_ready() take memory_lock_
// themselves.
std::lock_guard<std::mutex> memory_lock(memory_lock_);
if (request.objectid >= memory_.size()) {
memory_.resize(request.objectid + 1, std::make_pair(ObjHandle(), MemoryStatusType::NOT_PRESENT));
}
}
switch (request.type) {
case ObjRequestType::ALLOC: {
ObjHandle handle = alloc(request.objectid, request.size); // This method acquires memory_lock_
RAY_CHECK(send_queues_[request.workerid].send(&handle), "Failed to send message from the object store to the worker with id " << request.workerid << " because the message queue was full.");
}
break;
case ObjRequestType::GET: {
// READY objects are answered immediately; objects that are not there yet
// are queued until process_gets_for_objectid runs for them.
std::lock_guard<std::mutex> memory_lock(memory_lock_);
std::pair<ObjHandle, MemoryStatusType>& item = memory_[request.objectid];
if (item.second == MemoryStatusType::READY) {
RAY_LOG(RAY_DEBUG, "Responding to GET request: returning objectid " << request.objectid);
RAY_CHECK(send_queues_[request.workerid].send(&item.first), "Failed to send message from the object store to the worker with id " << request.workerid << " because the message queue was full.");
} else if (item.second == MemoryStatusType::NOT_READY || item.second == MemoryStatusType::NOT_PRESENT || item.second == MemoryStatusType::PRE_ALLOCED) {
// Lock order here is memory_lock_ first, then get_queue_lock_.
std::lock_guard<std::mutex> lock(get_queue_lock_);
get_queue_.push_back(std::make_pair(request.workerid, request.objectid));
} else {
RAY_CHECK(false, "A worker requested objectid " << request.objectid << ", but memory_[objectid].second = " << memory_[request.objectid].second);
}
}
break;
case ObjRequestType::WORKER_DONE: {
object_ready(request.objectid, request.metadata_offset); // This method acquires memory_lock_
}
break;
default: {
RAY_CHECK(false, "Attempting to process request of type " << request.type << ". This code should be unreachable.");
}
}
}
// Request loop of the object store: receives requests from recv_queue_
// forever and dispatches them. ALIAS_DONE goes to process_objstore_request;
// ALLOC, GET and WORKER_DONE are logged and go to process_worker_request.
void ObjStoreService::process_requests() {
  // TODO(rkn): Should memory_lock_ be used in this method?
  ObjRequest request;
  for (;;) {
    RAY_CHECK(recv_queue_.receive(&request), "error receiving over IPC");
    if (request.type == ObjRequestType::ALIAS_DONE) {
      process_objstore_request(request);
      continue;
    }
    // Everything else should come from a worker; log it by type first.
    switch (request.type) {
      case ObjRequestType::ALLOC:
        RAY_LOG(RAY_VERBOSE, "Request (worker " << request.workerid << " to objstore " << objstoreid_ << "): Allocate object with objectid " << request.objectid << " and size " << request.size);
        break;
      case ObjRequestType::GET:
        RAY_LOG(RAY_VERBOSE, "Request (worker " << request.workerid << " to objstore " << objstoreid_ << "): Get object with objectid " << request.objectid);
        break;
      case ObjRequestType::WORKER_DONE:
        RAY_LOG(RAY_VERBOSE, "Request (worker " << request.workerid << " to objstore " << objstoreid_ << "): Finalize object with objectid " << request.objectid);
        break;
      default:
        RAY_CHECK(false, "Attempting to process request of type " << request.type << ". This code should be unreachable.");
        continue;
    }
    process_worker_request(request);
  }
}
// Answers every queued GET for `objectid`: the object's handle is sent to
// each waiting worker and the entry is removed from get_queue_.
// Must not be called with memory_lock_ or get_queue_lock_ held; both are
// acquired here (one after the other, never nested).
void ObjStoreService::process_gets_for_objectid(ObjectID objectid) {
  // Copy the handle under memory_lock_. Other threads resize memory_ while
  // holding that lock, so a reference into memory_ taken without it could be
  // invalidated while the queue is being processed.
  ObjHandle handle;
  {
    std::lock_guard<std::mutex> memory_lock(memory_lock_);
    handle = memory_[objectid].first;
  }
  std::lock_guard<std::mutex> get_queue_lock(get_queue_lock_);
  size_t i = 0;
  while (i < get_queue_.size()) {
    if (get_queue_[i].second != objectid) {
      ++i;
      continue;
    }
    RAY_CHECK(send_queues_[get_queue_[i].first].send(&handle), "Failed to send message from the object store to the worker with id " << get_queue_[i].first << " because the message queue was full.");
    // Remove the get task from the queue by swapping it with the last entry.
    // The index is not advanced, so the swapped-in entry is examined next.
    std::swap(get_queue_[i], get_queue_[get_queue_.size() - 1]);
    get_queue_.pop_back();
  }
}
// Allocates `size` bytes for `objectid` in the segment pool, records the
// handle in memory_ and moves the object to NOT_READY. The object must be
// NOT_PRESENT or PRE_ALLOCED. Acquires segmentpool_lock_ and then
// memory_lock_ (not nested). Returns the new handle.
ObjHandle ObjStoreService::alloc(ObjectID objectid, size_t size) {
  ObjHandle handle;
  {
    // Scoped guard: creating a shared memory segment can throw, and the pool
    // lock must not stay locked in that case.
    std::lock_guard<decltype(segmentpool_lock_)> segmentpool_lock(segmentpool_lock_);
    handle = segmentpool_->allocate(size);
  }
  std::lock_guard<std::mutex> memory_lock(memory_lock_);
  RAY_LOG(RAY_VERBOSE, "Allocating space for objectid " << objectid << " on object store " << objstoreid_);
  RAY_CHECK(memory_[objectid].second == MemoryStatusType::NOT_PRESENT || memory_[objectid].second == MemoryStatusType::PRE_ALLOCED, "Attempting to allocate space for objectid " << objectid << ", but memory_[objectid].second = " << memory_[objectid].second);
  memory_[objectid].first = handle;
  memory_[objectid].second = MemoryStatusType::NOT_READY;
  return handle;
}
// Marks `objectid` as READY (it must currently be NOT_READY), stores its
// metadata offset, answers the GET requests that were waiting for it and
// finally notifies the scheduler through the ObjReady RPC.
void ObjStoreService::object_ready(ObjectID objectid, size_t metadata_offset) {
{
// memory_lock_ is held only for the status update in this scope.
RAY_LOG(RAY_INFO, "Object with ObjectID " << objectid << " is ready.");
std::lock_guard<std::mutex> memory_lock(memory_lock_);
std::pair<ObjHandle, MemoryStatusType>& item = memory_[objectid];
RAY_CHECK_EQ(item.second, MemoryStatusType::NOT_READY, "A worker notified the object store that objectid " << objectid << " has been written to the object store, but memory_[objectid].second != NOT_READY.");
item.first.set_metadata_offset(metadata_offset);
item.second = MemoryStatusType::READY;
}
process_gets_for_objectid(objectid);
// Tell the scheduler that the object arrived
// TODO(pcm): put this in a separate thread so we don't have to pay the latency here
ClientContext objready_context;
ObjReadyRequest objready_request;
objready_request.set_objectid(objectid);
objready_request.set_objstoreid(objstoreid_);
AckReply objready_reply;
RAY_CHECK_GRPC(scheduler_stub_->ObjReady(&objready_context, objready_request, &objready_reply));
}
// Starts the thread that runs process_requests(), i.e. the loop serving the
// requests arriving on the receive queue. The thread is stored in
// communicator_thread_.
void ObjStoreService::start_objstore_service() {
communicator_thread_ = std::thread([this]() {
RAY_LOG(RAY_INFO, "started object store communicator server");
process_requests();
});
}
void start_objstore(const char* scheduler_addr, const char* node_ip_address) {
RAY_LOG(RAY_INFO, "Starting an object store on node " << std::string(node_ip_address));
auto scheduler_channel = grpc::CreateChannel(scheduler_addr, grpc::InsecureChannelCredentials());
RAY_LOG(RAY_INFO, "Object store connected to scheduler " << scheduler_addr);
ObjStoreService service(scheduler_channel);
ServerBuilder builder;
// Get GRPC to assign an unused port.
int port;
builder.AddListeningPort(std::string("0.0.0.0:0"), grpc::InsecureServerCredentials(), &port);
builder.RegisterService(&service);
std::unique_ptr<Server> server(builder.BuildAndStart());
if (server == nullptr) {
RAY_CHECK(false, "Failed to create the object store service.");
}
std::string objstore_address = std::string(node_ip_address) + ":" + std::to_string(port);
RAY_LOG(RAY_INFO, "This object store has address " << objstore_address);
std::string recv_queue_name = std::string("queue:") + objstore_address + std::string(":obj");
service.register_objstore(objstore_address, recv_queue_name);
service.start_objstore_service();
// Process incoming GRPC calls. These may come from the scheduler or from
// other object stores. This method does not return.
server->Wait();
}
RayConfig global_ray_config;
// Entry point of the object store process.
//
// argv[1] is the scheduler address and argv[2] is this node's IP address.
// When more arguments are present, an optional "--log-file-name <file>"
// selects logging to a file; otherwise logs go to stdout.
int main(int argc, char** argv) {
  RAY_CHECK_GE(argc, 3, "object store: expected at least two arguments (scheduler ip address and object store ip address)");
  if (argc > 3) {
    const char* log_file_name = get_cmd_option(argv, argv + argc, "--log-file-name");
    if (!log_file_name) {
      std::cout << "object store: writing logs to stdout; you can change this by passing --log-file-name <filename> to ./scheduler" << std::endl;
      global_ray_config.log_to_file = false;
    } else {
      std::cout << "object store: writing to log file " << log_file_name << std::endl;
      create_log_dir_or_die(log_file_name);
      global_ray_config.log_to_file = true;
      global_ray_config.logfile.open(log_file_name);
    }
  }
  const char* scheduler_addr = argv[1];
  const char* node_ip_address = argv[2];
  start_objstore(scheduler_addr, node_ip_address);
  return 0;
}
-81
View File
@@ -1,81 +0,0 @@
#ifndef RAY_OBJSTORE_H
#define RAY_OBJSTORE_H
#include <unordered_map>
#include <memory>
#include <thread>
#include <iostream>
#include <grpc++/grpc++.h>
#include "ray/ray.h"
#include "ray.grpc.pb.h"
#include "types.pb.h"
#include "ipc.h"
using grpc::Server;
using grpc::ServerBuilder;
using grpc::ServerReader;
using grpc::ServerContext;
using grpc::ClientContext;
using grpc::ServerWriter;
using grpc::ClientReader;
using grpc::Status;
using grpc::Channel;
// READY: This is used to indicate that the object has been copied from a
// worker and is ready to be used.
// NOT_READY: This is used to indicate that memory has been allocated for the
// object, but the object hasn't been copied from a worker yet.
// DEALLOCATED: This is used to indicate that the object has been deallocated.
// NOT_PRESENT: This is used to indicate that space has not been allocated for
// this object in this object store.
// PRE_ALLOCED: This is used to indicate that the memory has not yet been
// alloced, but it will be alloced soon. This is set when we call
// StartDelivery.
enum MemoryStatusType {READY = 0, NOT_READY = 1, DEALLOCATED = 2, NOT_PRESENT = 3, PRE_ALLOCED = 4};
// GRPC service implementation for a single object store. It overrides the
// ObjStore::Service handlers and additionally owns the bookkeeping for the
// objects held by this store (memory_), the queues used to talk to workers,
// and the stubs used to talk to the scheduler and to other object stores.
class ObjStoreService final : public ObjStore::Service {
public:
// Constructs the service using an already created channel to the scheduler.
ObjStoreService(std::shared_ptr<Channel> scheduler_channel);
// GRPC handlers declared by ObjStore::Service.
Status StartDelivery(ServerContext* context, const StartDeliveryRequest* request, AckReply* reply) override;
Status StreamObjTo(ServerContext* context, const StreamObjToRequest* request, ServerWriter<ObjChunk>* writer) override;
Status NotifyAlias(ServerContext* context, const NotifyAliasRequest* request, AckReply* reply) override;
Status DeallocateObject(ServerContext* context, const DeallocateObjectRequest* request, AckReply* reply) override;
Status ObjStoreInfo(ServerContext* context, const ObjStoreInfoRequest* request, ObjStoreInfoReply* reply) override;
// Starts the communicator thread (communicator_thread_).
void start_objstore_service();
// Registers this object store under the given address and receive-queue name.
void register_objstore(const std::string& objstore_address, const std::string& recv_queue_name);
private:
void get_data_from(ObjectID objectid, ObjStore::Stub& stub);
// check if we already connected to the other objstore, if yes, return reference to connection, otherwise connect
ObjStore::Stub& get_objstore_stub(const std::string& objstore_address);
void process_worker_request(const ObjRequest request);
void process_objstore_request(const ObjRequest request);
void process_requests();
void process_gets_for_objectid(ObjectID objectid);
// Allocates space for an object and marks its entry in memory_ as NOT_READY.
ObjHandle alloc(ObjectID objectid, size_t size);
// Marks an object as READY and notifies pending gets and the scheduler.
void object_ready(ObjectID objectid, size_t metadata_offset);
static const size_t CHUNK_SIZE;
std::string objstore_address_;
ObjStoreId objstoreid_; // id of this objectstore in the scheduler object store table
std::shared_ptr<MemorySegmentPool> segmentpool_;
std::mutex segmentpool_lock_; // guards segmentpool_
std::vector<std::pair<ObjHandle, MemoryStatusType> > memory_; // object ID -> (memory address, memory status)
std::mutex memory_lock_; // guards memory_
std::unordered_map<std::string, std::unique_ptr<ObjStore::Stub>> objstores_;
std::mutex objstores_lock_;
std::unique_ptr<Scheduler::Stub> scheduler_stub_;
std::vector<std::pair<WorkerId, ObjectID> > get_queue_; // pending (worker, object) get requests
std::mutex get_queue_lock_;
MessageQueue<ObjRequest> recv_queue_; // This queue is used by workers to send tasks to the object store.
std::vector<MessageQueue<ObjHandle> > send_queues_; // This maps workerid -> queue. The object store uses these queues to send replies to the relevant workers.
std::thread communicator_thread_;
std::vector<std::shared_ptr<std::thread> > delivery_threads_; // TODO(rkn): document
// TODO(rkn): possibly add lock, and properly remove these threads from the delivery_threads_ when the deliveries are done
};
#endif
+22
View File
@@ -0,0 +1,22 @@
CC = gcc
CFLAGS = -g -Wall --std=c99 -D_XOPEN_SOURCE=500 -D_POSIX_C_SOURCE=200809L -Icommon -Icommon/thirdparty -fPIC
BUILD = build
all: $(BUILD)/photon_scheduler $(BUILD)/photon_client.a
$(BUILD)/photon_client.a: photon_client.o
ar rcs $(BUILD)/photon_client.a photon_client.o
$(BUILD)/photon_scheduler: photon.h photon_scheduler.c photon_algorithm.c common
$(CC) $(CFLAGS) -o $@ photon_scheduler.c photon_algorithm.c common/build/libcommon.a common/thirdparty/hiredis/libhiredis.a -Icommon/thirdparty/ -Icommon/ ../plasma/build/libplasma_client.a -I../plasma/src/
common: FORCE
git submodule update --init --recursive
cd common; make
# Remove all build products. The -f flags keep `make clean` from failing when
# there is nothing to delete (e.g. on a fresh checkout with no object files).
clean:
	cd common; make clean
	rm -rf $(BUILD)/*
	rm -f *.o
FORCE:
View File
+140
View File
@@ -0,0 +1,140 @@
#include <Python.h>
#include "common_extension.h"
#include "photon_client.h"
#include "task.h"
PyObject *PhotonError;
// clang-format off
typedef struct {
PyObject_HEAD
photon_conn *photon_connection;
} PyPhotonClient;
// clang-format on
/* tp_init slot of PhotonClient: expects a single string argument (the socket
 * name) and connects to the local scheduler through it. Returns 0 on success
 * and -1 if the arguments could not be parsed. */
static int PyPhotonClient_init(PyPhotonClient *self, PyObject *args,
                               PyObject *kwds) {
  char *socket_name;
  if (PyArg_ParseTuple(args, "s", &socket_name)) {
    self->photon_connection = photon_connect(socket_name);
    return 0;
  }
  return -1;
}
/* tp_dealloc slot of PhotonClient: releases the connection struct and then
 * the Python object itself.
 * NOTE(review): the socket held by the connection is not closed here --
 * confirm whether that is intended. */
static void PyPhotonClient_dealloc(PyPhotonClient *self) {
  photon_conn *connection = self->photon_connection;
  free(connection);
  Py_TYPE(self)->tp_free((PyObject *)self);
}
/* PhotonClient.submit(task): submit a task to the local scheduler.
 *
 * The single argument must be an instance of the Task type. The "O!" format
 * makes PyArg_ParseTuple raise TypeError for anything else, instead of
 * blindly casting an arbitrary object to PyTask and reading its `spec` field.
 * Returns None on success and NULL (with an exception set) on bad arguments. */
static PyObject *PyPhotonClient_submit(PyObject *self, PyObject *args) {
  PyObject *py_task;
  if (!PyArg_ParseTuple(args, "O!", &PyTaskType, &py_task)) {
    return NULL;
  }
  photon_submit(((PyPhotonClient *)self)->photon_connection,
                ((PyTask *)py_task)->spec);
  Py_RETURN_NONE;
}
// clang-format off
/* PhotonClient.get_task(): fetch the next task from the local scheduler and
 * wrap it in a Python Task object via PyTask_make. */
static PyObject *PyPhotonClient_get_task(PyObject *self) {
  photon_conn *connection = ((PyPhotonClient *)self)->photon_connection;
  task_spec *spec;
  /* photon_get_task may block for a long time, so the global interpreter
   * lock is released around the call. */
  Py_BEGIN_ALLOW_THREADS
  spec = photon_get_task(connection);
  Py_END_ALLOW_THREADS
  return PyTask_make(spec);
}
// clang-format on
/* Method table of the PhotonClient type: maps the Python-visible method names
 * to their C implementations. The table is terminated by a NULL sentinel. */
static PyMethodDef PyPhotonClient_methods[] = {
{"submit", (PyCFunction)PyPhotonClient_submit, METH_VARARGS,
"Submit a task to the local scheduler."},
{"get_task", (PyCFunction)PyPhotonClient_get_task, METH_NOARGS,
"Get a task from the local scheduler."},
{NULL} /* Sentinel */
};
/* Type object for photon.PhotonClient. Only the slots that are needed are
 * filled in: the deallocator, the method table, the initializer and the
 * generic allocator; every other slot is left at 0.
 * NOTE(review): the positional initializer starting with
 * PyObject_HEAD_INIT(NULL) followed by ob_size looks like the Python 2 static
 * type layout -- confirm before building against another Python version. */
static PyTypeObject PyPhotonClientType = {
PyObject_HEAD_INIT(NULL) 0, /* ob_size */
"photon.PhotonClient", /* tp_name */
sizeof(PyPhotonClient), /* tp_basicsize */
0, /* tp_itemsize */
(destructor)PyPhotonClient_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_compare */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT, /* tp_flags */
"PhotonClient object", /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
PyPhotonClient_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)PyPhotonClient_init, /* tp_init */
0, /* tp_alloc */
PyType_GenericNew, /* tp_new */
};
/* Module-level functions exported by the photon module. The table is
 * terminated by a NULL sentinel. */
static PyMethodDef photon_methods[] = {
{"check_simple_value", check_simple_value, METH_VARARGS,
"Should the object be passed by value?"},
{NULL} /* Sentinel */
};
#ifndef PyMODINIT_FUNC /* declarations for DLL import/export */
#define PyMODINIT_FUNC void
#endif
/* Module initializer for the "photon" extension module.
 *
 * Readies the Task, ObjectID and PhotonClient types, creates the module,
 * exposes the three types on it and registers the photon.error exception.
 * If any step fails, the function returns early and leaves the Python
 * exception set by the failing call in place. */
PyMODINIT_FUNC initphoton(void) {
  PyObject *m;

  if (PyType_Ready(&PyTaskType) < 0)
    return;

  if (PyType_Ready(&PyObjectIDType) < 0)
    return;

  if (PyType_Ready(&PyPhotonClientType) < 0)
    return;

  m = Py_InitModule3("photon", photon_methods,
                     "A module for the local scheduler.");
  /* Module creation can fail; do not pass a NULL module to
   * PyModule_AddObject below. */
  if (m == NULL)
    return;

  Py_INCREF(&PyTaskType);
  PyModule_AddObject(m, "Task", (PyObject *)&PyTaskType);

  Py_INCREF(&PyObjectIDType);
  PyModule_AddObject(m, "ObjectID", (PyObject *)&PyObjectIDType);

  Py_INCREF(&PyPhotonClientType);
  PyModule_AddObject(m, "PhotonClient", (PyObject *)&PyPhotonClientType);

  char photon_error[] = "photon.error";
  PhotonError = PyErr_NewException(photon_error, NULL, NULL);
  /* Py_INCREF must not be applied to NULL, so bail out if the exception
   * class could not be created. */
  if (PhotonError == NULL)
    return;
  Py_INCREF(PhotonError);
  PyModule_AddObject(m, "photon_error", PhotonError);
}
+14
View File
@@ -0,0 +1,14 @@
from setuptools import setup, find_packages, Extension
# Build description of the "photon" C extension. The extension is compiled
# from its own source file plus the shared common extension source, and is
# linked against the prebuilt photon client and common static libraries.
photon_sources = [
    "photon_extension.c",
    "../../common/lib/python/common_extension.c",
]
photon_include_dirs = [
    "../../",
    "../../common/",
    "../../common/thirdparty/",
    "../../common/lib/python",
]
photon_static_libs = [
    "../../build/photon_client.a",
    "../../common/build/libcommon.a",
]

photon_module = Extension(
    "photon",
    sources=photon_sources,
    include_dirs=photon_include_dirs,
    extra_objects=photon_static_libs,
    extra_compile_args=["--std=c99", "-Werror"],
)

setup(
    name="Photon",
    version="0.1",
    description="Photon library for Ray",
    ext_modules=[photon_module],
)
+40
View File
@@ -0,0 +1,40 @@
#ifndef PHOTON_H
#define PHOTON_H
#include "common/task.h"
#include "common/state/db.h"
#include "utarray.h"
#include "uthash.h"
enum photon_message_type {
/** Notify the local scheduler that a task has finished. */
TASK_DONE = 64,
/** Get a new task from the local scheduler. */
GET_TASK,
/** This is sent from the local scheduler to a worker to tell the worker to
* execute a task. */
EXECUTE_TASK,
};
// clang-format off
/** Contains all information that is associated to a worker. */
typedef struct {
int sock;
} worker;
// clang-format on
/* These are needed to define the UT_arrays. They are only declared here
 * (extern); the single definition with its initializer lives in the
 * scheduler's source file. Without `extern`, every translation unit that
 * includes this header would emit its own tentative definition, which fails
 * to link when the compiler does not merge common symbols (-fno-common). */
extern UT_icd task_ptr_icd;
extern UT_icd worker_icd;
/** Resources that are exposed to the scheduling algorithm. */
typedef struct {
/** List of workers available to this node. The index into this array
* is the worker_index and is used to identify workers throughout
* the program. */
UT_array *workers;
/* The handle to the database. */
db_handle *db;
} scheduler_info;
#endif /* PHOTON_H */
+184
View File
@@ -0,0 +1,184 @@
#include "photon_algorithm.h"
#include <stdbool.h>
#include "utarray.h"
#include "state/task_log.h"
#include "photon.h"
#include "photon_scheduler.h"
typedef struct {
/* Object id of this object. */
object_id object_id;
/* Handle for the uthash table. */
UT_hash_handle handle;
} available_object;
/** Part of the photon state that is maintained by the scheduling algorithm. */
struct scheduler_state {
/** An array of pointers to tasks that are waiting to be scheduled. */
UT_array *task_queue;
/** An array of worker indices corresponding to clients that are
* waiting for tasks. */
UT_array *available_workers;
/** A hash map of the objects that are available in the local Plasma store.
* This information could be a little stale. */
available_object *local_objects;
};
/* Allocate and initialize the state of the scheduling algorithm: an empty
 * cache of locally available objects, an empty task queue and an empty queue
 * of available workers. The result is released with free_scheduler_state. */
scheduler_state *make_scheduler_state(void) {
  scheduler_state *state = malloc(sizeof(scheduler_state));
  /* Fail loudly on allocation failure instead of dereferencing NULL below. */
  CHECK(state != NULL);
  /* Initialize an empty hash map for the cache of local available objects. */
  state->local_objects = NULL;
  /* Initialize the local data structures used for queuing tasks and workers. */
  utarray_new(state->task_queue, &task_ptr_icd);
  utarray_new(state->available_workers, &ut_int_icd);
  return state;
}
/* Release the scheduling algorithm state together with everything it owns:
 * the task instances still waiting in the task queue, both queues, and all
 * entries of the local object cache. */
void free_scheduler_state(scheduler_state *s) {
  /* The task queue stores owning pointers to task instances (they are freed
   * with free() when a task is assigned), so release the ones that were
   * never scheduled before dropping the array itself. */
  for (task_instance **task = (task_instance **) utarray_front(s->task_queue);
       task != NULL;
       task = (task_instance **) utarray_next(s->task_queue, task)) {
    free(*task);
  }
  utarray_free(s->task_queue);
  utarray_free(s->available_workers);
  /* Every entry of the local object cache was malloc'ed when the object
   * became available; unlink and free each of them. */
  available_object *entry;
  available_object *tmp;
  HASH_ITER(handle, s->local_objects, entry, tmp) {
    HASH_DELETE(handle, s->local_objects, entry);
    free(entry);
  }
  free(s);
}
/**
 * Decide whether a task can run right now, i.e. whether every argument that
 * is passed by reference is present in the cache of locally available objects.
 *
 * @param s The scheduler state.
 * @param task Task specification of the task to check.
 * @return True if all by-reference arguments of the task are in the local
 *         object cache, false otherwise.
 */
bool can_run(scheduler_state *s, task_spec *task) {
  int64_t num_args = task_num_args(task);
  for (int i = 0; i < num_args; ++i) {
    /* Arguments that are not passed by reference never block the task. */
    if (task_arg_type(task, i) != ARG_BY_REF) {
      continue;
    }
    object_id obj_id = *task_arg_id(task, i);
    available_object *entry;
    HASH_FIND(handle, s->local_objects, &obj_id, sizeof(object_id), entry);
    if (entry == NULL) {
      /* A missing dependency means the task cannot be scheduled yet. */
      return false;
    }
  }
  return true;
}
/**
 * Scan the task queue in order and hand the first runnable task (one whose
 * dependencies are available locally) to the given worker. The worker is not
 * removed from the available worker queue by this function.
 *
 * @param info Scheduler info, used to reach the worker.
 * @param state The scheduler state.
 * @param worker_index The index of the worker.
 * @return 1 if a task was assigned to the worker, 0 otherwise.
 */
int find_and_schedule_task_if_possible(scheduler_info *info,
                                       scheduler_state *state,
                                       int worker_index) {
  for (int index = 0; index < utarray_len(state->task_queue); ++index) {
    task_instance **candidate =
        (task_instance **) utarray_eltptr(state->task_queue, index);
    task_spec *spec = task_instance_task_spec(*candidate);
    if (!can_run(state, spec)) {
      continue;
    }
    /* First runnable task found: send it to the worker, then release the
     * instance and drop its slot from the queue. */
    assign_task_to_worker(info, spec, worker_index);
    free(*candidate);
    utarray_erase(state->task_queue, index, 1);
    return 1;
  }
  /* No queued task has all of its dependencies available. */
  return 0;
}
/* Handle a task submitted by a worker: either run it right away on an
 * available worker (when one exists and the task's dependencies are local),
 * or queue it. In both cases the task instance is recorded in the task log. */
void handle_task_submitted(scheduler_info *info,
                           scheduler_state *s,
                           task_spec *task) {
  /* The instance ID is distinct from the task ID; it tells apart potentially
   * multiple executions of the same task. */
  task_iid instance_id = globally_unique_id();
  task_instance *instance =
      make_task_instance(instance_id, task, TASK_STATUS_WAITING, NIL_ID);
  int schedule_locally =
      (utarray_len(s->available_workers) > 0) && can_run(s, task);
  if (!schedule_locally) {
    /* Queue the task. The queue takes ownership of the instance, which is
     * freed once the task is assigned to a worker. */
    utarray_push_back(s->task_queue, &instance);
    /* Submit the task to redis. */
    task_log_add_task(info->db, instance);
    return;
  }
  /* Run the task on the worker at the back of the available worker queue and
   * take that worker out of the queue. */
  int *worker_index = (int *) utarray_back(s->available_workers);
  assign_task_to_worker(info, task, *worker_index);
  utarray_pop_back(s->available_workers);
  /* Submit the task to redis. */
  task_log_add_task(info->db, instance);
  /* Nothing else holds on to the instance in this case, so free it here. */
  free(instance);
}
/* Handle a worker asking for work: give it the first runnable queued task if
 * there is one, otherwise remember the worker as available. */
void handle_worker_available(scheduler_info *info,
                             scheduler_state *state,
                             int worker_index) {
  if (find_and_schedule_task_if_possible(info, state, worker_index)) {
    /* The worker received a task, so it is not idle. */
    return;
  }
  /* A worker must not be listed as available twice. */
  int *queued = (int *) utarray_front(state->available_workers);
  while (queued != NULL) {
    CHECK(*queued != worker_index);
    queued = (int *) utarray_next(state->available_workers, queued);
  }
  /* Append the worker index to the queue of available workers. */
  utarray_push_back(state->available_workers, &worker_index);
  LOG_INFO("Adding worker_index %d to available workers.\n", worker_index);
}
/* Handle a notification that an object became available in the local object
 * store: record the object in the local cache and then try to hand queued
 * tasks to the currently available workers.
 *
 * A repeated notification for an object that is already cached does not add
 * a second entry: uthash does not enforce key uniqueness, so a blind HASH_ADD
 * would insert a duplicate and leak the extra allocation. */
void handle_object_available(scheduler_info *info,
                             scheduler_state *state,
                             object_id object_id) {
  available_object *entry;
  HASH_FIND(handle, state->local_objects, &object_id, sizeof(object_id), entry);
  if (entry == NULL) {
    /* TODO(rkn): When does this get freed? */
    entry = (available_object *) malloc(sizeof(available_object));
    CHECK(entry != NULL);
    entry->object_id = object_id;
    HASH_ADD(handle, state->local_objects, object_id, sizeof(object_id), entry);
  }

  /* Check if we can schedule any tasks. Workers are served from the front of
   * the available worker queue; the loop stops at the first worker for which
   * no runnable task exists. */
  int num_tasks_scheduled = 0;
  for (int *p = (int *) utarray_front(state->available_workers); p != NULL;
       p = (int *) utarray_next(state->available_workers, p)) {
    /* Schedule a task on this worker if possible. */
    int scheduled_task = find_and_schedule_task_if_possible(info, state, *p);
    if (!scheduled_task) {
      /* There are no tasks we can schedule, so exit the loop. */
      break;
    }
    num_tasks_scheduled += 1;
  }
  /* The workers that received a task are exactly the first
   * num_tasks_scheduled entries, so remove them in one go. */
  utarray_erase(state->available_workers, 0, num_tasks_scheduled);
}
+88
View File
@@ -0,0 +1,88 @@
#ifndef PHOTON_ALGORITHM_H
#define PHOTON_ALGORITHM_H
#include "photon.h"
#include "common/task.h"
/* ==== The scheduling algorithm ====
*
* This file contains declarations for all functions and data structures
* that need to be provided if you want to implement a new algorithm
* for the local scheduler.
*
*/
/** Internal state of the scheduling algorithm. */
typedef struct scheduler_state scheduler_state;
/**
* Initialize the scheduler state.
*
* @return Internal state of the scheduling algorithm.
*/
scheduler_state *make_scheduler_state(void);
/**
* Free the scheduler state.
*
* @param state Internal state of the scheduling algorithm.
* @return Void.
*/
void free_scheduler_state(scheduler_state *state);
/**
* This function will be called when a new task is submitted by a worker for
* execution.
*
* @param info Info about resources exposed by photon to the scheduling
* algorithm.
* @param state State of the scheduling algorithm.
* @param task Task that is submitted by the worker.
* @return Void.
*/
void handle_task_submitted(scheduler_info *info,
scheduler_state *state,
task_spec *task);
/**
* This function will be called when a task is assigned by the global scheduler
* for execution on this local scheduler.
*
* @param info Info about resources exposed by photon to the scheduling
* algorithm.
* @param state State of the scheduling algorithm.
* @param task Task that is assigned by the global scheduler.
* @return Void.
*/
void handle_task_assigned(scheduler_info *info,
scheduler_state *state,
task_spec *task);
/**
* This function is called if a new object becomes available in the local
* plasma store.
*
* @param info Info about resources exposed by photon to the scheduling
* algorithm.
* @param state State of the scheduling algorithm.
* @param object_id ID of the object that became available.
* @return Void.
*/
void handle_object_available(scheduler_info *info,
scheduler_state *state,
object_id object_id);
/**
* This function is called when a new worker becomes available
*
* @param info Info about resources exposed by photon to the scheduling
* algorithm.
* @param state State of the scheduling algorithm.
* @param worker_index The index of the worker that becomes available.
* @return Void.
*/
void handle_worker_available(scheduler_info *info,
scheduler_state *state,
int worker_index);
#endif /* PHOTON_ALGORITHM_H */
+41
View File
@@ -0,0 +1,41 @@
#include "photon_client.h"
#include "common/io.h"
#include "common/task.h"
#include <stdlib.h>
/* Connect to the local scheduler listening on the socket `photon_socket`.
 * Returns a heap-allocated connection struct holding the socket returned by
 * connect_ipc_sock. */
photon_conn *photon_connect(const char *photon_socket) {
  photon_conn *result = malloc(sizeof(photon_conn));
  /* Fail loudly on allocation failure instead of writing through NULL. */
  CHECK(result != NULL);
  result->conn = connect_ipc_sock(photon_socket);
  return result;
}
/* Send the task specification to the local scheduler as a SUBMIT_TASK
 * message; the message body is the raw bytes of the task spec. */
void photon_submit(photon_conn *conn, task_spec *task) {
  uint8_t *payload = (uint8_t *)task;
  write_message(conn->conn, SUBMIT_TASK, task_size(task), payload);
}
/* Ask the local scheduler for a task and wait for the reply. The reply must
 * be an EXECUTE_TASK message whose length matches the size of the task spec
 * it carries. The returned pointer is the message buffer filled in by
 * read_message. */
task_spec *photon_get_task(photon_conn *conn) {
  int fd = conn->conn;
  write_message(fd, GET_TASK, 0, NULL);
  /* Receive a task from the local scheduler. This will block until the local
   * scheduler gives this client a task. */
  int64_t reply_type;
  int64_t reply_length;
  uint8_t *reply;
  read_message(fd, &reply_type, &reply_length, &reply);
  CHECK(reply_type == EXECUTE_TASK);
  task_spec *task = (task_spec *)reply;
  CHECK(reply_length == task_size(task));
  return task;
}
/* Send an empty TASK_DONE message to the local scheduler. */
void photon_task_done(photon_conn *conn) {
  int fd = conn->conn;
  write_message(fd, TASK_DONE, 0, NULL);
}
/* Send an empty DISCONNECT_CLIENT message to the local scheduler. The
 * connection struct itself is not released here. */
void photon_disconnect(photon_conn *conn) {
  int fd = conn->conn;
  write_message(fd, DISCONNECT_CLIENT, 0, NULL);
}
/* Send an empty LOG_MESSAGE message to the local scheduler. */
void photon_log_message(photon_conn *conn) {
  int fd = conn->conn;
  write_message(fd, LOG_MESSAGE, 0, NULL);
}
+66
View File
@@ -0,0 +1,66 @@
#ifndef PHOTON_CLIENT_H
#define PHOTON_CLIENT_H
#include "common/task.h"
#include "photon.h"
typedef struct {
/* File descriptor of the Unix domain socket that connects to photon. */
int conn;
} photon_conn;
/**
* Connect to the local scheduler.
*
* @param photon_socket The name of the socket to use to connect to the local
scheduler.
* @return The connection information.
*/
photon_conn *photon_connect(const char *photon_socket);
/**
* Submit a task to the local scheduler.
*
* @param conn The connection information.
* @param task The address of the task to submit.
* @return Void.
*/
void photon_submit(photon_conn *conn, task_spec *task);
/**
* Get next task for this client. This will block until the scheduler assigns
* a task to this worker. This allocates and returns a task, and so the task
* must be freed by the caller.
*
* @todo When does this actually get freed?
*
* @param conn The connection information.
* @return The address of the assigned task.
*/
task_spec *photon_get_task(photon_conn *conn);
/**
* Tell the local scheduler that the client has finished executing a task.
*
* @param conn The connection information.
* @return Void.
*/
void photon_task_done(photon_conn *conn);
/**
* Disconnect from the local scheduler.
*
* @param conn The connection information.
* @return Void.
*/
void photon_disconnect(photon_conn *conn);
/**
* Send a log message to the local scheduler.
*
* @param conn The connection information.
* @return Void.
*/
void photon_log_message(photon_conn *conn);
#endif
+228
View File
@@ -0,0 +1,228 @@
#include <inttypes.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>
#include "common.h"
#include "event_loop.h"
#include "io.h"
#include "photon.h"
#include "photon_algorithm.h"
#include "photon_scheduler.h"
#include "plasma_client.h"
#include "state/db.h"
#include "state/task_log.h"
#include "utarray.h"
#include "uthash.h"
UT_icd task_ptr_icd = {sizeof(task_instance *), NULL, NULL, NULL};
UT_icd worker_icd = {sizeof(worker), NULL, NULL, NULL};
/** Association between the socket fd of a worker and its worker_index. */
typedef struct {
/** The socket fd of a worker. */
int sock;
/** The index of the worker in scheduler_info->workers. */
int64_t worker_index;
/** Handle for the hash table. */
UT_hash_handle hh;
} worker_index;
struct local_scheduler_state {
/* The local scheduler event loop. */
event_loop *loop;
/* The Plasma client. */
plasma_store_conn *plasma_conn;
/* Association between client socket and worker index. */
worker_index *worker_index;
/* Info that is exposed to the scheduling algorithm. */
scheduler_info *scheduler_info;
/* State for the scheduling algorithm. */
scheduler_state *scheduler_state;
};
/* Create the local scheduler state.
 *
 * Connects to the Plasma store on `plasma_socket_name`, subscribes to its
 * notifications and registers the notification callback on `loop`; creates
 * the (empty) worker list; connects to Redis at `redis_addr`:`redis_port`
 * and attaches that connection to `loop`; and creates the state of the
 * scheduling algorithm. */
local_scheduler_state *init_local_scheduler(event_loop *loop,
                                            const char *redis_addr,
                                            int redis_port,
                                            const char *plasma_socket_name) {
  local_scheduler_state *state = malloc(sizeof(local_scheduler_state));
  /* Fail loudly on allocation failure instead of dereferencing NULL. */
  CHECK(state != NULL);
  state->loop = loop;
  /* Connect to Plasma. This method will retry if Plasma hasn't started yet. */
  state->plasma_conn = plasma_store_connect(plasma_socket_name);
  /* Subscribe to notifications about sealed objects. */
  int plasma_fd = plasma_subscribe(state->plasma_conn);
  /* Add the callback that processes the notification to the event loop. */
  event_loop_add_file(loop, plasma_fd, EVENT_LOOP_READ,
                      process_plasma_notification, state);
  state->worker_index = NULL;
  /* Add scheduler info. */
  state->scheduler_info = malloc(sizeof(scheduler_info));
  CHECK(state->scheduler_info != NULL);
  utarray_new(state->scheduler_info->workers, &worker_icd);
  /* Connect to Redis. */
  state->scheduler_info->db =
      db_connect(redis_addr, redis_port, "photon", "", -1);
  db_attach(state->scheduler_info->db, loop);
  /* Add scheduler state. */
  state->scheduler_state = make_scheduler_state();
  return state;
}
/* Tear down the local scheduler: disconnect from the database, release the
 * worker list and the socket -> worker index table, free the scheduling
 * algorithm state, destroy the event loop and free the state itself.
 * NOTE(review): the Plasma connection is not released here -- confirm which
 * call is meant to close it. */
void free_local_scheduler(local_scheduler_state *s) {
  db_disconnect(s->scheduler_info->db);
  /* The worker array was created in init_local_scheduler and must be
   * released before the struct that holds it. */
  utarray_free(s->scheduler_info->workers);
  free(s->scheduler_info);
  /* One table entry is malloc'ed per accepted client connection; unlink and
   * free all of them. */
  worker_index *entry;
  worker_index *tmp;
  HASH_ITER(hh, s->worker_index, entry, tmp) {
    HASH_DEL(s->worker_index, entry);
    free(entry);
  }
  free_scheduler_state(s->scheduler_state);
  event_loop_destroy(s->loop);
  free(s);
}
/* Send the task specification to the worker with the given index as an
 * EXECUTE_TASK message. The index must refer to an existing entry of
 * info->workers. */
void assign_task_to_worker(scheduler_info *info,
                           task_spec *task,
                           int worker_index) {
  CHECK(worker_index < utarray_len(info->workers));
  worker *target = (worker *) utarray_eltptr(info->workers, worker_index);
  int sock = target->sock;
  write_message(sock, EXECUTE_TASK, task_size(task), (uint8_t *) task);
}
/* Event loop callback for the Plasma notification socket: read one object ID
 * and tell the scheduling algorithm that the object is available.
 *
 * The ID is read into a stack variable (the previous heap buffer was never
 * freed) and the result of recv is checked, so that a closed connection or a
 * failed read is not turned into a notification for a garbage object ID. */
void process_plasma_notification(event_loop *loop,
                                 int client_sock,
                                 void *context,
                                 int events) {
  local_scheduler_state *s = context;
  /* Read the notification from Plasma. MSG_WAITALL asks for the complete ID
   * so that a short read does not leave the stream in the middle of an ID. */
  object_id obj_id;
  ssize_t nbytes = recv(client_sock, &obj_id, sizeof(obj_id), MSG_WAITALL);
  if (nbytes == 0) {
    /* The peer closed the connection; stop watching the socket so the event
     * loop does not keep firing on it. */
    LOG_ERR("Plasma notification socket on fd %d was closed", client_sock);
    event_loop_remove_file(loop, client_sock);
    return;
  }
  if (nbytes != (ssize_t) sizeof(obj_id)) {
    LOG_ERR("failed to read notification from Plasma on fd %d", client_sock);
    return;
  }
  handle_object_available(s->scheduler_info, s->scheduler_state, obj_id);
}
/* Event loop callback for client sockets: read one message from the client
 * and dispatch on its type. The message buffer is freed before returning.
 * NOTE(review): DISCONNECT_CLIENT only removes the socket from the event
 * loop; nothing visible here closes the file descriptor -- confirm. */
void process_message(event_loop *loop, int client_sock, void *context,
                     int events) {
  local_scheduler_state *s = context;

  int64_t type;
  int64_t length;
  uint8_t *message;
  read_message(client_sock, &type, &length, &message);

  LOG_DEBUG("New event of type %" PRId64, type);

  switch (type) {
  case SUBMIT_TASK: {
    /* The message body is a task specification; its size must match. */
    task_spec *spec = (task_spec *) message;
    CHECK(task_size(spec) == length);
    handle_task_submitted(s->scheduler_info, s->scheduler_state, spec);
    break;
  }
  case TASK_DONE: {
    break;
  }
  case GET_TASK: {
    /* Translate the client socket into its worker index. */
    worker_index *wi;
    HASH_FIND_INT(s->worker_index, &client_sock, wi);
    printf("worker_index is %" PRId64 "\n", wi->worker_index);
    handle_worker_available(s->scheduler_info, s->scheduler_state,
                            wi->worker_index);
    break;
  }
  case DISCONNECT_CLIENT: {
    LOG_INFO("Disconnecting client on fd %d", client_sock);
    event_loop_remove_file(loop, client_sock);
    break;
  }
  case LOG_MESSAGE: {
    break;
  }
  default:
    /* This code should be unreachable. */
    CHECK(0);
  }
  free(message);
}
/* Event loop callback for the listening socket: accept the new client,
 * register its socket for message processing and record it as a worker. */
void new_client_connection(event_loop *loop, int listener_sock, void *context,
                           int events) {
  local_scheduler_state *s = context;
  int new_socket = accept_client(listener_sock);
  event_loop_add_file(loop, new_socket, EVENT_LOOP_READ, process_message, s);
  LOG_INFO("new connection with fd %d", new_socket);
  /* Remember which worker index belongs to this socket. The index is the
   * current length of the worker list, i.e. the slot the worker is appended
   * to below. */
  /* TODO(pcm): Where shall we free this? */
  worker_index *index_entry = malloc(sizeof(worker_index));
  index_entry->sock = new_socket;
  index_entry->worker_index = utarray_len(s->scheduler_info->workers);
  HASH_ADD_INT(s->worker_index, sock, index_entry);
  /* Add worker to list of workers. */
  worker new_worker;
  new_worker.sock = new_socket;
  utarray_push_back(s->scheduler_info->workers, &new_worker);
}
/* We need this code so we can clean up when we get a SIGTERM signal. */
local_scheduler_state *g_state;
/* Signal handler: on SIGTERM, free the global scheduler state and exit with
 * status 0. Every other signal is ignored by this handler. */
void signal_handler(int signal) {
  if (signal != SIGTERM) {
    return;
  }
  free_local_scheduler(g_state);
  exit(0);
}
/* End of the cleanup code. */
/* Bind the listening socket, create the event loop and the scheduler state
 * (stored in g_state so the signal handler can reach it), register the
 * callback for new client connections and run the event loop. */
void start_server(const char *socket_name,
                  const char *redis_addr,
                  int redis_port,
                  const char *plasma_socket_name) {
  int listener_fd = bind_ipc_sock(socket_name);
  event_loop *loop = event_loop_create();
  g_state =
      init_local_scheduler(loop, redis_addr, redis_port, plasma_socket_name);

  /* Run event loop. */
  event_loop_add_file(loop, listener_fd, EVENT_LOOP_READ,
                      new_client_connection, g_state);
  event_loop_run(loop);
}
int main(int argc, char *argv[]) {
signal(SIGTERM, signal_handler);
/* Path of the listening socket of the local scheduler. */
char *scheduler_socket_name = NULL;
/* IP address and port of redis. */
char *redis_addr_port = NULL;
/* Socket name for the local Plasma store. */
char *plasma_socket_name = NULL;
int c;
while ((c = getopt(argc, argv, "s:r:p:")) != -1) {
switch (c) {
case 's':
scheduler_socket_name = optarg;
break;
case 'r':
redis_addr_port = optarg;
break;
case 'p':
plasma_socket_name = optarg;
break;
default:
LOG_ERR("unknown option %c", c);
exit(-1);
}
}
if (!scheduler_socket_name) {
LOG_ERR("please specify socket for incoming connections with -s switch");
exit(-1);
}
if (!plasma_socket_name) {
LOG_ERR("please specify socket for connecting to Plasma with -p switch");
exit(-1);
}
/* Parse the Redis address into an IP address and a port. */
char redis_addr[16] = {0};
char redis_port[6] = {0};
if (!redis_addr_port ||
sscanf(redis_addr_port, "%15[0-9.]:%5[0-9]", redis_addr, redis_port) !=
2) {
LOG_ERR("need to specify redis address like 127.0.0.1:6379 with -r switch");
exit(-1);
}
start_server(scheduler_socket_name, &redis_addr[0], atoi(redis_port),
plasma_socket_name);
}
+52
View File
@@ -0,0 +1,52 @@
#ifndef PHOTON_SCHEDULER_H
#define PHOTON_SCHEDULER_H
#include "task.h"
#include "event_loop.h"
typedef struct local_scheduler_state local_scheduler_state;
/**
* Establish a connection to a new client.
*
* @param loop Event loop of the local scheduler.
* @param listener_sock Socket the local scheduler is listening on for new
* client requests.
* @param context State of the local scheduler.
* @param events Flag for events that are available on the listener socket.
* @return Void.
*/
void new_client_connection(event_loop *loop,
int listener_sock,
void *context,
int events);
/**
* This function can be called by the scheduling algorithm to assign a task
* to a worker.
*
* @param info Info about resources exposed by photon to the scheduling
* algorithm.
* @param task The task that is submitted to the worker.
* @param worker_index The index of the worker the task is submitted to.
* @return Void.
*/
void assign_task_to_worker(scheduler_info *info,
task_spec *task,
int worker_index);
/**
* This is the callback that is used to process a notification from the Plasma
* store that an object has been sealed.
*
* @param loop The local scheduler's event loop.
* @param client_sock The file descriptor to read the notification from.
* @param context The local scheduler state.
* @param events
* @return Void.
*/
void process_plasma_notification(event_loop *loop,
int client_sock,
void *context,
int events);
#endif /* PHOTON_SCHEDULER_H */
+151
View File
@@ -0,0 +1,151 @@
from __future__ import print_function
import os
import signal
import subprocess
import sys
import unittest
import random
import threading
import time
import photon
import plasma
USE_VALGRIND = False
class TestPhotonClient(unittest.TestCase):
def setUp(self):
    """Start Redis, the Plasma store and the local scheduler, then connect
    a Plasma client and a Photon client to them."""
    here = os.path.abspath(os.path.dirname(__file__))
    # Start Redis.
    redis_executable = os.path.join(here, "../common/thirdparty/redis-3.2.3/src/redis-server")
    self.p1 = subprocess.Popen([redis_executable, "--loglevel", "warning"])
    # Start Plasma.
    plasma_executable = os.path.join(here, "../../plasma/build/plasma_store")
    plasma_socket = "/tmp/plasma_store{}".format(random.randint(0, 10000))
    self.p2 = subprocess.Popen([plasma_executable, "-s", plasma_socket])
    time.sleep(0.1)
    self.plasma_client = plasma.PlasmaClient(plasma_socket)
    # Start the local scheduler, optionally under valgrind.
    scheduler_executable = os.path.join(here, "../build/photon_scheduler")
    scheduler_name = "/tmp/scheduler{}".format(random.randint(0, 10000))
    command = [scheduler_executable, "-s", scheduler_name, "-r", "127.0.0.1:6379", "-p", plasma_socket]
    valgrind_prefix = ["valgrind", "--track-origins=yes", "--leak-check=full", "--show-leak-kinds=all"] if USE_VALGRIND else []
    self.p3 = subprocess.Popen(valgrind_prefix + command)
    # Wait longer for the scheduler to come up when it runs under valgrind.
    time.sleep(1.0 if USE_VALGRIND else 0.1)
    # Connect to the scheduler.
    self.photon_client = photon.PhotonClient(scheduler_name)
def tearDown(self):
# Kill the Redis server.
self.p1.kill()
# Kill Plasma.
self.p2.kill()
# Kill the local scheduler.
if USE_VALGRIND:
self.p3.send_signal(signal.SIGTERM)
self.p3.wait()
os._exit(self.p3.returncode)
else:
self.p3.kill()
def test_submit_and_get_task(self):
# TODO(rkn): This should be a FunctionID.
function_id = photon.ObjectID(20 * "a")
object_ids = [photon.ObjectID(20 * chr(i)) for i in range(256)]
# Create and seal the objects in the object store so that we can schedule
# all of the subsequent tasks.
for object_id in object_ids:
self.plasma_client.create(object_id.id(), 0)
self.plasma_client.seal(object_id.id())
# Define some arguments to use for the tasks.
args_list = [
[],
#{},
#(),
1 * [1],
10 * [1],
100 * [1],
1000 * [1],
1 * ["a"],
10 * ["a"],
100 * ["a"],
1000 * ["a"],
[1, 1.3, 2L, 1L << 100, "hi", u"hi", [1, 2]],
object_ids[:1],
object_ids[:2],
object_ids[:3],
object_ids[:4],
object_ids[:5],
object_ids[:10],
object_ids[:100],
object_ids[:256],
[1, object_ids[0]],
[object_ids[0], "a"],
[1, object_ids[0], "a"],
[object_ids[0], 1, object_ids[1], "a"],
object_ids[:3] + [1, "hi", 2.3] + object_ids[:5],
object_ids + 100 * ["a"] + object_ids
]
for args in args_list:
for num_return_vals in [0, 1, 2, 3, 5, 10, 100]:
task = photon.Task(function_id, args, num_return_vals)
# Submit a task.
self.photon_client.submit(task)
# Get the task.
new_task = self.photon_client.get_task()
self.assertEqual(task.function_id().id(), new_task.function_id().id())
retrieved_args = new_task.arguments()
returns = new_task.returns()
self.assertEqual(len(args), len(retrieved_args))
self.assertEqual(num_return_vals, len(returns))
for i in range(len(retrieved_args)):
if isinstance(args[i], photon.ObjectID):
self.assertEqual(args[i].id(), retrieved_args[i].id())
else:
self.assertEqual(args[i], retrieved_args[i])
# Submit all of the tasks.
for args in args_list:
for num_return_vals in [0, 1, 2, 3, 5, 10, 100]:
task = photon.Task(function_id, args, num_return_vals)
self.photon_client.submit(task)
# Get all of the tasks.
for args in args_list:
for num_return_vals in [0, 1, 2, 3, 5, 10, 100]:
new_task = self.photon_client.get_task()
def test_scheduling_when_objects_ready(self):
# Create a task and submit it.
object_id = photon.ObjectID(20 * chr(0))
# TODO(rkn): This should be a FunctionID.
function_id = photon.ObjectID(20 * "a")
task = photon.Task(function_id, [object_id], 0)
self.photon_client.submit(task)
# Launch a thread to get the task.
def get_task():
self.photon_client.get_task()
t = threading.Thread(target=get_task)
t.start()
# Sleep to give the thread time to call get_task.
time.sleep(0.1)
# Create and seal the object ID in the object store. This should trigger a
# scheduling event.
self.plasma_client.create(object_id.id(), 0)
self.plasma_client.seal(object_id.id())
# Wait until the thread finishes so that we know the task was scheduled.
t.join()
if __name__ == "__main__":
  # An optional trailing "valgrind" argument enables running the scheduler
  # under valgrind. The last argument is popped (only when one was given) so
  # that it never reaches unittest's own argument parser.
  if len(sys.argv) > 1 and sys.argv.pop() == "valgrind":
    USE_VALGRIND = True
    print("Using valgrind for tests")
  unittest.main(verbosity=2)
+40
View File
@@ -0,0 +1,40 @@
CC = gcc
CFLAGS = -g -Wall --std=c99 -D_XOPEN_SOURCE=500 -D_POSIX_C_SOURCE=200809L -I. -Icommon -Icommon/thirdparty
BUILD = build

all: $(BUILD)/plasma_store $(BUILD)/plasma_manager $(BUILD)/plasma_client.so $(BUILD)/example $(BUILD)/libplasma_client.a

# Same as "all", but with debug logging compiled in.
debug: FORCE
debug: CFLAGS += -DRAY_COMMON_DEBUG=1
debug: all

# $(MAKE) -C is used for every recursive invocation: unlike "cd common; make",
# it cannot fall through and re-run make in this directory when the cd fails.
# rm -f keeps "clean" from failing when the build directory is already empty.
clean:
	$(MAKE) -C common clean
	rm -rf $(BUILD)/*

$(BUILD)/plasma_store: src/plasma_store.c src/plasma.h src/fling.h src/fling.c src/malloc.c src/malloc.h thirdparty/dlmalloc.c common
	$(CC) $(CFLAGS) src/plasma_store.c src/fling.c src/malloc.c common/build/libcommon.a -o $(BUILD)/plasma_store

$(BUILD)/plasma_manager: src/plasma_manager.c src/plasma.h src/plasma_client.c src/fling.h src/fling.c common
	$(CC) $(CFLAGS) src/plasma_manager.c src/plasma_client.c src/fling.c common/build/libcommon.a common/thirdparty/hiredis/libhiredis.a -o $(BUILD)/plasma_manager

$(BUILD)/plasma_client.so: src/plasma_client.c src/fling.h src/fling.c common
	$(CC) $(CFLAGS) src/plasma_client.c src/fling.c common/build/libcommon.a -fPIC -shared -o $(BUILD)/plasma_client.so

$(BUILD)/libplasma_client.a: src/plasma_client.o src/fling.o
	ar rcs $@ $^

$(BUILD)/example: src/plasma_client.c src/plasma.h src/example.c src/fling.h src/fling.c common
	$(CC) $(CFLAGS) src/plasma_client.c src/example.c src/fling.c common/build/libcommon.a -o $(BUILD)/example

# Build the common library (always re-run, the sub-make decides what is stale).
common: FORCE
	git submodule update --init --recursive
	$(MAKE) -C common

# Set the request timeout low for testing purposes.
test: CFLAGS += -DRAY_TIMEOUT=50
test: FORCE
	$(MAKE) -C common redis
test: all

# Empty target used as a prerequisite to force a rule to always run.
FORCE:
View File
File diff suppressed because it is too large Load Diff
+47
View File
@@ -0,0 +1,47 @@
/* A simple example on how to use the plasma store
*
* Can be called in the following way:
*
* cd build
* ./plasma_store -s /tmp/plasma_socket
* ./example -s /tmp/plasma_socket -g
* ./example -s /tmp/plasma_socket -c -f */
#include <stdlib.h>
#include <getopt.h>
#include <unistd.h>
#include <assert.h>
#include "plasma.h"
#include "plasma_client.h"
/* Entry point of the example client.
 *
 * Options are processed in the order given on the command line:
 *   -s <socket>  connect to the plasma store listening on <socket>
 *   -c           create a 100 byte object with a fixed ID
 *   -f           seal that object
 *   -g           get that object
 * -s must come before any of the other options because they all need an
 * established connection. */
int main(int argc, char *argv[]) {
  plasma_connection *conn = NULL;
  int64_t size;
  uint8_t *data;
  int c;
  /* The fixed object ID used by every operation of this example. */
  object_id id = {{255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
                   255, 255, 255, 255, 255, 255, 255, 255, 255, 255}};
  while ((c = getopt(argc, argv, "s:cfg")) != -1) {
    switch (c) {
    case 's':
      conn = plasma_connect(optarg, NULL, 0);
      break;
    case 'c':
      assert(conn != NULL);
      plasma_create(conn, id, 100, NULL, 0, &data);
      break;
    case 'f':
      assert(conn != NULL);
      plasma_seal(conn, id);
      break;
    case 'g':
      /* Like -c and -f, -g needs a connection; without this check a missing
       * -s would make plasma_get dereference a NULL connection. */
      assert(conn != NULL);
      plasma_get(conn, id, &size, &data, NULL, NULL);
      break;
    default:
      abort();
    }
  }
  assert(conn != NULL);
  plasma_disconnect(conn);
  return 0;
}
+81
View File
@@ -0,0 +1,81 @@
#include "fling.h"
#include <string.h>
/* Prepare "msg" and "iov" for sending or receiving one message that carries
 * ancillary (control) data.
 *
 * "buf" of length "buf_len" is installed as the control buffer. Its first
 * byte also serves as the single byte of regular data described by "iov".
 * No socket address is attached to the message. */
void init_msg(struct msghdr *msg,
              struct iovec *iov,
              char *buf,
              size_t buf_len) {
  /* No address is specified for the message. */
  msg->msg_name = NULL;
  msg->msg_namelen = 0;

  /* Exactly one data byte, taken from the start of buf. */
  iov->iov_base = buf;
  iov->iov_len = 1;
  msg->msg_iov = iov;
  msg->msg_iovlen = 1;

  /* The whole of buf is available for control messages. */
  msg->msg_control = buf;
  msg->msg_controllen = buf_len;
}
/* Send the file descriptor "fd" followed by "size" bytes of "payload" over the
 * socket "conn".
 *
 * The descriptor travels as an SCM_RIGHTS control message; the payload is
 * sent afterwards with a separate send call.
 *
 * Returns 0 on success and -1 if either the descriptor or the payload could
 * not be sent (errno is set by the failing call). */
int send_fd(int conn, int fd, const char *payload, int size) {
  struct msghdr msg;
  struct iovec iov;
  char buf[CMSG_SPACE(sizeof(int))];
  memset(buf, 0, sizeof(buf));

  init_msg(&msg, &iov, buf, sizeof(buf));

  /* Fill in the control message that carries the descriptor. */
  struct cmsghdr *header = CMSG_FIRSTHDR(&msg);
  header->cmsg_level = SOL_SOCKET;
  header->cmsg_type = SCM_RIGHTS;
  header->cmsg_len = CMSG_LEN(sizeof(int));
  *(int *) CMSG_DATA(header) = fd;

  /* Send the file descriptor. A failure here must be reported: previously it
   * short-circuited the return expression to 0, i.e. "success". */
  if (sendmsg(conn, &msg, 0) == -1) {
    return -1;
  }
  /* Send the payload. */
  if (send(conn, payload, size, 0) == -1) {
    return -1;
  }
  return 0;
}
/* Receive a file descriptor and up to "size" bytes of payload from the socket
 * "conn".
 *
 * The payload is written to "payload". Returns the received file descriptor,
 * or -1 on failure. If the sender passed more than one descriptor, all of
 * them are closed and errno is set to EBADMSG. */
int recv_fd(int conn, char *payload, int size) {
  struct msghdr msg;
  struct iovec iov;
  char buf[CMSG_SPACE(sizeof(int))];
  init_msg(&msg, &iov, buf, sizeof(buf));

  if (recvmsg(conn, &msg, 0) == -1) {
    return -1;
  }

  int found_fd = -1;
  int oh_noes = 0;
  for (struct cmsghdr *header = CMSG_FIRSTHDR(&msg); header != NULL;
       header = CMSG_NXTHDR(&msg, header)) {
    /* Only SCM_RIGHTS control messages carry file descriptors. */
    if (header->cmsg_level != SOL_SOCKET || header->cmsg_type != SCM_RIGHTS) {
      continue;
    }
    /* Number of descriptors stored in the data part of this message. */
    int count =
        (header->cmsg_len - (CMSG_DATA(header) - (unsigned char *) header)) /
        sizeof(int);
    for (int i = 0; i < count; ++i) {
      int fd = ((int *) CMSG_DATA(header))[i];
      if (found_fd == -1) {
        found_fd = fd;
      } else {
        close(fd);
        oh_noes = 1;
      }
    }
  }

  /* The sender sent us more than one file descriptor. We've closed
   * them all to prevent fd leaks but notify the caller that we got
   * a bad message. */
  if (oh_noes) {
    close(found_fd);
    errno = EBADMSG;
    return -1;
  }

  ssize_t len = recv(conn, payload, size, 0);
  if (len < 0) {
    /* The caller only sees -1, so the descriptor we already received would
     * leak if it were not closed here. Preserve recv's errno across close. */
    if (found_fd != -1) {
      int saved_errno = errno;
      close(found_fd);
      errno = saved_errno;
    }
    return -1;
  }
  return found_fd;
}
+35
View File
@@ -0,0 +1,35 @@
/* FLING: Exchanging file descriptors over sockets
*
* This is a little library for sending file descriptors over a socket
* between processes. The reason for doing that (as opposed to using
* filenames to share the files) is so (a) no files remain in the
* filesystem after all the processes terminate, (b) to make sure that
* there are no name collisions and (c) to be able to control who has
* access to the data.
*
* Most of the code is from https://github.com/sharvil/flingfd */
#include <unistd.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
/* This is necessary for Mac OS X, see http://www.apuebook.com/faqs2e.html
* (10). */
#if !defined(CMSG_SPACE) && !defined(CMSG_LEN)
#define CMSG_SPACE(len) \
(__DARWIN_ALIGN32(sizeof(struct cmsghdr)) + __DARWIN_ALIGN32(len))
#define CMSG_LEN(len) (__DARWIN_ALIGN32(sizeof(struct cmsghdr)) + (len))
#endif
void init_msg(struct msghdr *msg, struct iovec *iov, char *buf, size_t buf_len);
/* Send a file descriptor "fd" and a payload "payload" of size "size"
* over the socket "conn". Return 0 on success. */
int send_fd(int conn, int fd, const char *payload, int size);
/* Receive a file descriptor and a payload of size up to "size" from a
* socket "conn". The payload will be written to "payload" and the file
* descriptor will be returned. Returns -1 on failure. */
int recv_fd(int conn, char *payload, int size);
+246
View File
@@ -0,0 +1,246 @@
import os
import socket
import ctypes
import time
Addr = ctypes.c_ubyte * 4
PLASMA_ID_SIZE = 20
ID = ctypes.c_ubyte * PLASMA_ID_SIZE
# ctypes structure holding one object ID as an array of PLASMA_ID_SIZE unsigned
# bytes. Instances are what this module passes to the plasma client library
# whenever an object ID is needed (see make_plasma_id).
class PlasmaID(ctypes.Structure):
  _fields_ = [("plasma_id", ID)]
def make_plasma_id(string):
  """Build a PlasmaID from a string of exactly PLASMA_ID_SIZE characters.

  Args:
    string (str): The object ID; each character becomes one byte of the ID.

  Returns:
    A PlasmaID whose bytes are the character codes of the string.

  Raises:
    Exception: If the string does not have length PLASMA_ID_SIZE.
  """
  if len(string) != PLASMA_ID_SIZE:
    raise Exception("PlasmaIDs must be {} characters long".format(PLASMA_ID_SIZE))
  id_bytes = [ord(character) for character in string]
  return PlasmaID(plasma_id=ID(*id_bytes))
class PlasmaBuffer(object):
  """Wrapper around a buffer that lives in the Plasma store.

  Calls to create and get on a PlasmaClient return this type rather than a raw
  buffer so that a destructor can be attached: when the wrapper is destroyed,
  Plasma is told that the object is no longer in use, which allows the store
  to potentially free the memory backing it.

  Attributes:
    buffer (buffer): A buffer containing an object in the Plasma store.
    plasma_id (PlasmaID): The ID of the object in the buffer.
    plasma_client (PlasmaClient): The PlasmaClient that we use to communicate
      with the store and manager.
  """

  def __init__(self, buff, plasma_id, plasma_client):
    """Store the buffer together with its ID and owning client."""
    self.buffer = buff
    self.plasma_id = plasma_id
    self.plasma_client = plasma_client

  def __del__(self):
    """Release the object in Plasma when this wrapper goes away."""
    owner = self.plasma_client
    owner.client.plasma_release(owner.plasma_conn, self.plasma_id)

  def __getitem__(self, index):
    """Forward reads to the wrapped buffer."""
    return self.buffer[index]

  def __setitem__(self, index, value):
    """Forward writes to the wrapped buffer.

    For buffers that are read only this is expected to fail.
    """
    self.buffer[index] = value

  def __len__(self):
    """Return the number of elements of the wrapped buffer."""
    return len(self.buffer)
class PlasmaClient(object):
  """The PlasmaClient is used to interface with a plasma store and a plasma manager.

  The PlasmaClient can ask the PlasmaStore to allocate a new buffer, seal a
  buffer, and get a buffer. Buffers are referred to by object IDs, which are
  strings.
  """

  def __init__(self, socket_name, addr=None, port=None):
    """Initialize the PlasmaClient.

    Args:
      socket_name (str): Name of the socket the plasma store is listening at.
      addr (str): IPv4 address of plasma manager attached to the plasma store.
      port (int): Port number of the plasma manager attached to the plasma store.

    Raises:
      Exception: If port is given but is not an integer in (0, 65536).
    """
    if port is not None:
      if not isinstance(port, int):
        raise Exception("The 'port' argument must be an integer. The given argument has type {}.".format(type(port)))
      if not 0 < port < 65536:
        raise Exception("The 'port' argument must be greater than 0 and less than 65536. The given value is {}.".format(port))

    plasma_client_library = os.path.join(os.path.abspath(os.path.dirname(__file__)), "../../build/plasma_client.so")
    self.client = ctypes.cdll.LoadLibrary(plasma_client_library)

    # Declare the return types of the C functions; ctypes assumes int
    # otherwise, which would truncate the connection pointer.
    self.client.plasma_connect.restype = ctypes.c_void_p
    self.client.plasma_create.restype = None
    self.client.plasma_get.restype = None
    self.client.plasma_release.restype = None
    self.client.plasma_contains.restype = None
    self.client.plasma_seal.restype = None
    self.client.plasma_delete.restype = None
    self.client.plasma_subscribe.restype = ctypes.c_int

    # Helpers from the Python C API that wrap raw memory in buffer objects.
    self.buffer_from_memory = ctypes.pythonapi.PyBuffer_FromMemory
    self.buffer_from_memory.argtypes = [ctypes.c_void_p, ctypes.c_int64]
    self.buffer_from_memory.restype = ctypes.py_object

    self.buffer_from_read_write_memory = ctypes.pythonapi.PyBuffer_FromReadWriteMemory
    self.buffer_from_read_write_memory.argtypes = [ctypes.c_void_p, ctypes.c_int64]
    self.buffer_from_read_write_memory.restype = ctypes.py_object

    # Set by subscribe(). Initialized here so that get_next_notification can
    # report a missing subscription instead of failing with AttributeError.
    self.notification_sock = None

    if addr is not None and port is not None:
      self.has_manager_conn = True
      self.plasma_conn = ctypes.c_void_p(self.client.plasma_connect(socket_name, addr, port))
    else:
      self.has_manager_conn = False
      self.plasma_conn = ctypes.c_void_p(self.client.plasma_connect(socket_name, None, 0))

  def create(self, object_id, size, metadata=None):
    """Create a new buffer in the PlasmaStore for a particular object ID.

    The returned buffer is mutable until seal is called.

    Args:
      object_id (str): A string used to identify an object.
      size (int): The size in bytes of the created buffer.
      metadata (buffer): An optional buffer encoding whatever metadata the user
        wishes to encode.

    Returns:
      A PlasmaBuffer wrapping the writable memory of the new object.
    """
    # This is used to hold the address of the buffer.
    data = ctypes.c_void_p()
    # Turn the metadata into the right type.
    metadata = buffer("") if metadata is None else metadata
    metadata = (ctypes.c_ubyte * len(metadata)).from_buffer_copy(metadata)
    # plasma_create takes both sizes as int64_t. Without argtypes ctypes would
    # pass plain Python integers as C ints, so wrap them explicitly.
    self.client.plasma_create(self.plasma_conn,
                              make_plasma_id(object_id),
                              ctypes.c_int64(size),
                              ctypes.cast(metadata, ctypes.POINTER(ctypes.c_ubyte * len(metadata))),
                              ctypes.c_int64(len(metadata)),
                              ctypes.byref(data))
    return PlasmaBuffer(self.buffer_from_read_write_memory(data, size), make_plasma_id(object_id), self)

  def get(self, object_id):
    """Create a buffer from the PlasmaStore based on object ID.

    If the object has not been sealed yet, this call will block. The retrieved
    buffer is immutable.

    Args:
      object_id (str): A string used to identify an object.

    Returns:
      A PlasmaBuffer wrapping the data of the object.
    """
    size = ctypes.c_int64()
    data = ctypes.c_void_p()
    metadata_size = ctypes.c_int64()
    metadata = ctypes.c_void_p()
    self.client.plasma_get(self.plasma_conn, make_plasma_id(object_id), ctypes.byref(size), ctypes.byref(data), ctypes.byref(metadata_size), ctypes.byref(metadata))
    return PlasmaBuffer(self.buffer_from_memory(data, size), make_plasma_id(object_id), self)

  def get_metadata(self, object_id):
    """Create a buffer holding the metadata of an object in the PlasmaStore.

    If the object has not been sealed yet, this call will block until the object
    has been sealed. The retrieved buffer is immutable.

    Args:
      object_id (str): A string used to identify an object.

    Returns:
      A PlasmaBuffer wrapping the metadata of the object.
    """
    size = ctypes.c_int64()
    data = ctypes.c_void_p()
    metadata_size = ctypes.c_int64()
    metadata = ctypes.c_void_p()
    self.client.plasma_get(self.plasma_conn, make_plasma_id(object_id), ctypes.byref(size), ctypes.byref(data), ctypes.byref(metadata_size), ctypes.byref(metadata))
    return PlasmaBuffer(self.buffer_from_memory(metadata, metadata_size), make_plasma_id(object_id), self)

  def contains(self, object_id):
    """Check if the object is present and has been sealed in the PlasmaStore.

    Args:
      object_id (str): A string used to identify an object.

    Returns:
      True if the store reports the object as present, False otherwise.
    """
    has_object = ctypes.c_int()
    self.client.plasma_contains(self.plasma_conn, make_plasma_id(object_id), ctypes.byref(has_object))
    has_object = has_object.value
    if has_object == 1:
      return True
    elif has_object == 0:
      return False
    else:
      raise Exception("This code should be unreachable.")

  def seal(self, object_id):
    """Seal the buffer in the PlasmaStore for a particular object ID.

    Once a buffer has been sealed, the buffer is immutable and can only be
    accessed through get.

    Args:
      object_id (str): A string used to identify an object.
    """
    self.client.plasma_seal(self.plasma_conn, make_plasma_id(object_id))

  def delete(self, object_id):
    """Delete the buffer in the PlasmaStore for a particular object ID.

    Once a buffer has been deleted, the buffer is no longer accessible.

    Args:
      object_id (str): A string used to identify an object.
    """
    self.client.plasma_delete(self.plasma_conn, make_plasma_id(object_id))

  def transfer(self, addr, port, object_id):
    """Transfer local object with id object_id to another plasma instance

    Args:
      addr (str): IPv4 address of the plasma instance the object is sent to.
      port (int): Port number of the plasma instance the object is sent to.
      object_id (str): A string used to identify an object.

    Raises:
      Exception: If this client is not connected to a plasma manager.
    """
    if not self.has_manager_conn:
      raise Exception("Not connected to the plasma manager socket")
    self.client.plasma_transfer(self.plasma_conn, addr, port, make_plasma_id(object_id))

  def fetch(self, object_ids):
    """Fetch the objects with the given IDs from other plasma manager instances.

    Args:
      object_ids (list of str): Strings used to identify the objects.

    Returns:
      A list with one bool per requested object ID, converted from the success
      flags filled in by the client library.

    Raises:
      Exception: If this client is not connected to a plasma manager.
    """
    # Fail before doing any work if there is no manager to ask.
    if not self.has_manager_conn:
      raise Exception("Not connected to the plasma manager socket")
    object_id_array = (len(object_ids) * PlasmaID)()
    for i, object_id in enumerate(object_ids):
      object_id_array[i] = make_plasma_id(object_id)
    success_array = (len(object_ids) * ctypes.c_int)()
    self.client.plasma_fetch(self.plasma_conn,
                             object_id_array._length_,
                             object_id_array,
                             success_array)
    return [bool(success) for success in success_array]

  def subscribe(self):
    """Subscribe to notifications about sealed objects."""
    fd = self.client.plasma_subscribe(self.plasma_conn)
    self.notification_sock = socket.fromfd(fd, socket.AF_UNIX, socket.SOCK_STREAM)
    # Make the socket non-blocking.
    self.notification_sock.setblocking(0)

  def get_next_notification(self):
    """Get the next notification from the notification socket.

    Returns:
      The PLASMA_ID_SIZE bytes of the next notification.

    Raises:
      Exception: If subscribe has not been called, or if the socket is closed
        before a complete notification was read.
    """
    if self.notification_sock is None:
      raise Exception("To get notifications, first call subscribe.")
    # Loop until we've read PLASMA_ID_SIZE bytes from the socket. A stream
    # socket may deliver a notification in several pieces, so accumulate them.
    message_data = b""
    while len(message_data) < PLASMA_ID_SIZE:
      try:
        chunk = self.notification_sock.recv(PLASMA_ID_SIZE - len(message_data))
      except socket.error:
        # The socket is non-blocking and nothing is available yet.
        time.sleep(0.001)
        continue
      if not chunk:
        raise Exception("The notification socket was closed before a full notification was received.")
      message_data += chunk
    return message_data
+140
View File
@@ -0,0 +1,140 @@
#include <assert.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include "common.h"
#include "plasma.h"
#include "uthash.h"
void *fake_mmap(size_t);
int fake_munmap(void *, size_t);
#define MMAP(s) fake_mmap(s)
#define MUNMAP(a, s) fake_munmap(a, s)
#define DIRECT_MMAP(s) fake_mmap(s)
#define DIRECT_MUNMAP(a, s) fake_munmap(a, s)
#define USE_DL_PREFIX
#define HAVE_MORECORE 0
#define DEFAULT_MMAP_THRESHOLD MAX_SIZE_T
#define DEFAULT_GRANULARITY ((size_t) 128U * 1024U)
#include "thirdparty/dlmalloc.c"
#undef MMAP
#undef MUNMAP
#undef DIRECT_MMAP
#undef DIRECT_MUNMAP
#undef USE_DL_PREFIX
#undef HAVE_MORECORE
#undef DEFAULT_GRANULARITY
/* Bookkeeping for one file-backed memory region created by fake_mmap. Every
 * record is a member of two uthash tables at once: records_by_fd (through
 * hh_fd) and records_by_pointer (through hh_pointer). */
struct mmap_record {
  /* File descriptor of the backing file; key of records_by_fd. */
  int fd;
  /* Address returned by mmap for the file; key of records_by_pointer. */
  void *pointer;
  /* Size of the mapping in bytes, including the sizeof(size_t) bytes that
   * fake_mmap adds to the size requested by dlmalloc. */
  int64_t size;
  /* uthash handle for records_by_fd. */
  UT_hash_handle hh_fd;
  /* uthash handle for records_by_pointer. */
  UT_hash_handle hh_pointer;
};
/* TODO(rshin): Don't have two hash tables. */
struct mmap_record *records_by_fd = NULL;
struct mmap_record *records_by_pointer = NULL;
const int GRANULARITY_MULTIPLIER = 2;
/* Create a buffer. This is creating a temporary file and then
 * immediately unlinking it so we do not leave traces in the system.
 *
 * The file is resized to "size" bytes. Returns the file descriptor of the
 * file, or -1 on failure; no descriptor is left open in the failure case. */
int create_buffer(int64_t size) {
  /* mkstemp replaces the trailing XXXXXX in place, so it needs a writable
   * copy of the name template. */
  char file_name[] = "/tmp/plasmaXXXXXX";
  int fd = mkstemp(file_name);
  if (fd < 0) {
    return -1;
  }
  /* Note: the descriptor is deliberately not wrapped with fdopen. The stream
   * was never used and could not be released without closing fd, so every
   * call leaked it. */
  if (unlink(file_name) != 0) {
    LOG_ERR("unlink error");
    close(fd);
    return -1;
  }
  if (ftruncate(fd, (off_t) size) != 0) {
    LOG_ERR("ftruncate error");
    close(fd);
    return -1;
  }
  return fd;
}
/* Replacement for mmap that is handed to dlmalloc (see the MMAP macro).
 *
 * Instead of anonymous memory, it maps a freshly created, unlinked temporary
 * file and records the mapping so that get_malloc_mapinfo can later translate
 * an address into (fd, mapping size, offset).
 *
 * Returns the start of the usable memory, or MAP_FAILED on failure. */
void *fake_mmap(size_t size) {
  /* Add sizeof(size_t) so that the returned pointer is deliberately not
   * page-aligned. This ensures that the segments of memory returned by
   * fake_mmap are never contiguous. */
  size += sizeof(size_t);

  int fd = create_buffer(size);
  if (fd < 0) {
    return MAP_FAILED;
  }
  void *pointer = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if (pointer == MAP_FAILED) {
    /* Nobody else knows about fd yet, so it has to be closed here. */
    close(fd);
    return pointer;
  }

  struct mmap_record *record = malloc(sizeof(struct mmap_record));
  if (record == NULL) {
    munmap(pointer, size);
    close(fd);
    return MAP_FAILED;
  }

  /* Increase dlmalloc's allocation granularity directly. */
  mparams.granularity *= GRANULARITY_MULTIPLIER;

  record->fd = fd;
  record->pointer = pointer;
  record->size = size;
  HASH_ADD(hh_fd, records_by_fd, fd, sizeof(fd), record);
  HASH_ADD(hh_pointer, records_by_pointer, pointer, sizeof(pointer), record);

  /* We lie to dlmalloc about where mapped memory actually lives. */
  pointer = (char *) pointer + sizeof(size_t);
  LOG_DEBUG("%p = fake_mmap(%zu)", pointer, size);
  return pointer;
}
/* Replacement for munmap that is handed to dlmalloc (see the MUNMAP macro).
 *
 * Only regions that exactly match an earlier fake_mmap call are unmapped;
 * anything else is rejected with -1. On a match the backing file descriptor
 * is closed, the bookkeeping record is removed and freed, and the result of
 * munmap is returned. */
int fake_munmap(void *addr, size_t size) {
  LOG_DEBUG("fake_munmap(%p, %zu)", addr, size);
  /* Undo the offset and padding that fake_mmap applied. */
  addr = (char *) addr - sizeof(size_t);
  size += sizeof(size_t);

  struct mmap_record *record;

  HASH_FIND(hh_pointer, records_by_pointer, &addr, sizeof(addr), record);
  if (record == NULL || (size_t) record->size != size) {
    /* Reject requests to munmap that don't directly match previous
     * calls to mmap, to prevent dlmalloc from trimming. */
    return -1;
  }
  close(record->fd);

  HASH_DELETE(hh_fd, records_by_fd, record);
  HASH_DELETE(hh_pointer, records_by_pointer, record);
  /* The record was allocated in fake_mmap and is no longer reachable from
   * either table, so release it here. */
  free(record);

  return munmap(addr, size);
}
/* Look up the memory-mapped file that contains "addr".
 *
 * On success, *fd receives the file descriptor of the mapping, *map_size its
 * size in bytes and *offset the distance of addr from the start of the
 * mapping. If no recorded mapping contains addr, the outputs are set to
 * -1, 0 and 0 respectively. */
void get_malloc_mapinfo(void *addr,
                        int *fd,
                        int64_t *map_size,
                        ptrdiff_t *offset) {
  /* Values reported when addr is not inside any recorded mapping. */
  *fd = -1;
  *map_size = 0;
  *offset = 0;

  /* TODO(rshin): Implement a more efficient search through records_by_fd. */
  struct mmap_record *candidate = records_by_fd;
  while (candidate != NULL) {
    char *begin = (char *) candidate->pointer;
    char *end = begin + candidate->size;
    if ((char *) addr >= begin && (char *) addr < end) {
      *fd = candidate->fd;
      *map_size = candidate->size;
      *offset = (char *) addr - begin;
      return;
    }
    candidate = candidate->hh_fd.next;
  }
}
+9
View File
@@ -0,0 +1,9 @@
#ifndef MALLOC_H
#define MALLOC_H

/* ptrdiff_t and int64_t are used in the prototype below; include their
 * headers so that this header can be included on its own. */
#include <stddef.h>
#include <stdint.h>

/* Look up the memory-mapped file that contains "addr".
 *
 * On return, *fd holds the file descriptor of the mapping, *map_length the
 * size of the mapping in bytes and *offset the offset of addr from the start
 * of the mapping. If addr does not belong to any mapping, they are set to
 * -1, 0 and 0. */
void get_malloc_mapinfo(void *addr,
                        int *fd,
                        int64_t *map_length,
                        ptrdiff_t *offset);

#endif /* MALLOC_H */
+96
View File
@@ -0,0 +1,96 @@
#ifndef PLASMA_H
#define PLASMA_H
#include <inttypes.h>
#include <stdio.h>
#include <errno.h>
#include <stddef.h>
#include <string.h>
#include "common.h"
/** Summary information about an object.
 *  NOTE(review): the field descriptions below are inferred from the field
 *  names only; the code that fills this struct is not visible here. In
 *  particular the units of the two time fields need to be confirmed. */
typedef struct {
  /** Presumably the size in bytes of the object's data. */
  int64_t data_size;
  /** Presumably the size in bytes of the object's metadata. */
  int64_t metadata_size;
  /** Presumably the time at which the object was created (unit unknown). */
  int64_t create_time;
  /** Presumably how long it took to construct the object (unit unknown). */
  int64_t construct_duration;
} plasma_object_info;
/** Handle to access a memory mapped file and map it into the client's address
 *  space. The client keys its table of existing mappings by store_fd. */
typedef struct {
  /** The file descriptor of the memory mapped file in the store. It is used
   * as a unique identifier of the file in the client to look up the
   * corresponding file descriptor on the client's side. */
  int store_fd;
  /** The size in bytes of the memory mapped file. */
  int64_t mmap_size;
} object_handle;
/** Location of an object inside a memory mapped file: which file it lives in
 *  and where in that file its data and metadata are. */
typedef struct {
  /** Handle for memory mapped file the object is stored in. */
  object_handle handle;
  /** The offset in bytes in the memory mapped file of the data. */
  ptrdiff_t data_offset;
  /** The offset in bytes in the memory mapped file of the metadata. */
  ptrdiff_t metadata_offset;
  /** The size in bytes of the data. */
  int64_t data_size;
  /** The size in bytes of the metadata. */
  int64_t metadata_size;
} plasma_object;
/** Result of looking for an object: not found (0) or found (1). */
enum object_status { OBJECT_NOT_FOUND = 0, OBJECT_FOUND = 1 };
/** Types of the messages understood by Plasma. The numbering starts at 128
 *  and continues consecutively. */
enum plasma_message_type {
  /** Create a new object. */
  PLASMA_CREATE = 128,
  /** Get an object. */
  PLASMA_GET,
  /** Tell the store that the client no longer needs an object. */
  PLASMA_RELEASE,
  /** Check if an object is present. */
  PLASMA_CONTAINS,
  /** Seal an object. */
  PLASMA_SEAL,
  /** Delete an object. */
  PLASMA_DELETE,
  /** Subscribe to notifications about sealed objects. */
  PLASMA_SUBSCRIBE,
  /** Request transfer to another store. */
  PLASMA_TRANSFER,
  /** Header for sending data. */
  PLASMA_DATA,
  /** Request a fetch of an object in another store. */
  PLASMA_FETCH,
};
/** A request sent to Plasma. Which fields are meaningful depends on the type
 *  of the message the request is sent with. */
typedef struct {
  /** The size of the object's data. */
  int64_t data_size;
  /** The size of the object's metadata. */
  int64_t metadata_size;
  /** In a transfer request, this is the IP address of the Plasma Manager to
   * transfer the object to. */
  uint8_t addr[4];
  /** In a transfer request, this is the port of the Plasma Manager to transfer
   * the object to. */
  int port;
  /** The number of object IDs that will be included in this request. */
  int num_object_ids;
  /** The IDs of the objects that the request is about. The array is declared
   * with one element; a request that carries more IDs is allocated with
   * additional space directly behind the struct. */
  object_id object_ids[1];
} plasma_request;
/** A reply to a plasma_request. */
typedef struct {
  /** The object ID that this reply refers to. */
  object_id object_id;
  /** The object that is returned with this reply. */
  plasma_object object;
  /** This is used only to respond to requests of type
   * PLASMA_CONTAINS or PLASMA_FETCH. It is 1 if the object is
   * present and 0 otherwise. */
  int has_object;
} plasma_reply;
#endif
+434
View File
@@ -0,0 +1,434 @@
/* PLASMA CLIENT: Client library for using the plasma store and manager */
#include <assert.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <strings.h>
#include <netinet/in.h>
#include <netdb.h>
#include "common.h"
#include "io.h"
#include "plasma.h"
#include "plasma_client.h"
#include "fling.h"
#include "uthash.h"
/** One entry of the client's table of memory-mapped store files. */
typedef struct {
  /** Key that uniquely identifies the memory mapped file. In practice, we
   * take the numerical value of the file descriptor in the object store. */
  int key;
  /** The result of mmap for this file descriptor. */
  uint8_t *pointer;
  /** The length of the memory-mapped file. */
  size_t length;
  /** The number of objects in this memory-mapped file that are currently being
   * used by the client. When this count reaches zero, we unmap the file. */
  int count;
  /** Handle for the uthash table. */
  UT_hash_handle hh;
} client_mmap_table_entry;
/** One entry of the client's table of objects that it is currently using. */
typedef struct {
  /** The ID of the object. This is used as the key in the hash table. */
  object_id object_id;
  /** The file descriptor of the memory-mapped file that contains the object.
   * This is the descriptor's value in the store; it is used to look up the
   * matching client_mmap_table_entry. */
  int fd;
  /** A count of the number of times this client has called plasma_create or
   * plasma_get on this object ID minus the number of calls to plasma_release.
   * When this count reaches zero, we remove the entry from the objects_in_use
   * and decrement a count in the relevant client_mmap_table_entry. */
  int count;
  /** Handle for the uthash table. */
  UT_hash_handle hh;
} object_in_use_entry;
/** Information about a connection between a Plasma Client and Plasma Store.
 * This is used to avoid mapping the same files into memory multiple times. */
struct plasma_connection {
  /** File descriptor of the Unix domain socket that connects to the store. */
  int store_conn;
  /** File descriptor of the Unix domain socket that connects to the manager. */
  int manager_conn;
  /** Table of dlmalloc buffer files that have been memory mapped so far. This
   * is a hash table mapping a file descriptor to a struct containing the
   * address of the corresponding memory-mapped file. The key is the value the
   * descriptor has in the store, not the client's own descriptor. */
  client_mmap_table_entry *mmap_table;
  /** A hash table of the object IDs that are currently being used by this
   * client. */
  object_in_use_entry *objects_in_use;
};
/* Return the size in bytes of a plasma_request that carries
 * "num_object_ids" object IDs. */
int plasma_request_size(int num_object_ids) {
  /* The struct itself already has room for one ID, so only the remaining
   * ones add to its size. */
  int additional_ids = num_object_ids - 1;
  return sizeof(plasma_request) + additional_ids * sizeof(object_id);
}
/* Write the request "req" as a message of type "type" to the socket "fd".
 * The number of bytes sent is derived from req->num_object_ids. A failed
 * write aborts through CHECK. */
void plasma_send_request(int fd, int type, plasma_request *req) {
  int error = write_message(fd, type,
                            plasma_request_size(req->num_object_ids),
                            (uint8_t *) req);
  /* TODO(swang): Actually handle the write error. */
  CHECK(!error);
}
/* Build a request that refers to the single object "object_id". All other
 * fields of the request are zero. */
plasma_request make_plasma_request(object_id object_id) {
  plasma_request req = {0};
  req.num_object_ids = 1;
  req.object_ids[0] = object_id;
  return req;
}
/* Allocate a request that refers to the "num_object_ids" objects listed in
 * "object_ids". All other fields of the request are zero.
 *
 * The request is heap-allocated; the caller is responsible for freeing it. */
plasma_request *make_plasma_multiple_request(int num_object_ids,
                                             object_id object_ids[]) {
  int req_size = plasma_request_size(num_object_ids);
  /* Zero the request: it is written to a socket as a whole, so fields that
   * are not set here must not contain uninitialized heap memory. */
  plasma_request *req = calloc(1, req_size);
  CHECK(req != NULL);
  req->num_object_ids = num_object_ids;
  memcpy(&req->object_ids, object_ids, num_object_ids * sizeof(object_id));
  return req;
}
/* Return the address at which the store file identified by "store_fd_val" is
 * mapped in this client. If the file has been mapped before, the existing
 * mapping is reused; otherwise "fd" is mapped with length "map_size" and the
 * mapping is remembered in conn->mmap_table. In both cases "fd" is closed.
 * The process exits if mmap fails. */
uint8_t *lookup_or_mmap(plasma_connection *conn,
                        int fd,
                        int store_fd_val,
                        int64_t map_size) {
  client_mmap_table_entry *entry;
  HASH_FIND_INT(conn->mmap_table, &store_fd_val, entry);
  if (entry != NULL) {
    /* Already mapped: the freshly received descriptor is not needed. */
    close(fd);
    return entry->pointer;
  }

  uint8_t *mapping =
      mmap(NULL, map_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if (mapping == MAP_FAILED) {
    LOG_ERR("mmap failed");
    exit(-1);
  }
  /* The mapping stays valid after the descriptor is closed. */
  close(fd);

  /* Remember the mapping; nothing uses it yet, hence a count of zero. */
  entry = malloc(sizeof(client_mmap_table_entry));
  entry->key = store_fd_val;
  entry->pointer = mapping;
  entry->length = map_size;
  entry->count = 0;
  HASH_ADD_INT(conn->mmap_table, key, entry);
  return mapping;
}
/* Record that this client uses the object "object_id" one more time.
 *
 * "fd" is the key of the memory-mapped file containing the object. When the
 * object was not in use before, it is added to conn->objects_in_use and the
 * use count of that file is incremented as well. The matching decrements
 * happen in plasma_release. */
void increment_object_count(plasma_connection *conn,
                            object_id object_id,
                            int fd) {
  object_in_use_entry *in_use;
  HASH_FIND(hh, conn->objects_in_use, &object_id, sizeof(object_id), in_use);
  if (in_use != NULL) {
    /* Entries are removed as soon as their count drops to zero. */
    CHECK(in_use->count > 0);
  } else {
    /* First use of this object: add it to the hash table of object IDs in
     * use. The corresponding call to free happens in plasma_release. */
    in_use = malloc(sizeof(object_in_use_entry));
    in_use->object_id = object_id;
    in_use->fd = fd;
    in_use->count = 0;
    HASH_ADD(hh, conn->objects_in_use, object_id, sizeof(object_id), in_use);
    /* One more object of the memory-mapped file is being used. The
     * corresponding decrement should happen in plasma_release. */
    client_mmap_table_entry *mapping;
    HASH_FIND_INT(conn->mmap_table, &in_use->fd, mapping);
    CHECK(mapping != NULL);
    CHECK(mapping->count >= 0);
    mapping->count += 1;
  }
  /* One more instance of this object is being used by this client. The
   * corresponding decrement should happen in plasma_release. */
  in_use->count += 1;
}
/* Create an object in the store and map it into this client.
 *
 * conn:          connection to the store.
 * object_id:     ID of the object to create.
 * data_size:     size in bytes of the object's data.
 * metadata:      metadata to copy behind the data, or NULL to copy nothing
 *                (used when the metadata is streamed in by a transfer).
 * metadata_size: size in bytes of the metadata.
 * data:          receives the address of the object's data.
 *
 * Aborts through CHECK if the store's reply cannot be received or does not
 * match the request. A call to plasma_release is required when the client is
 * done with the object. */
void plasma_create(plasma_connection *conn,
                   object_id object_id,
                   int64_t data_size,
                   uint8_t *metadata,
                   int64_t metadata_size,
                   uint8_t **data) {
  LOG_DEBUG("called plasma_create on conn %d with size %" PRId64
            " and metadata size "
            "%" PRId64,
            conn->store_conn, data_size, metadata_size);
  plasma_request req = make_plasma_request(object_id);
  req.data_size = data_size;
  req.metadata_size = metadata_size;
  plasma_send_request(conn->store_conn, PLASMA_CREATE, &req);
  plasma_reply reply;
  int fd = recv_fd(conn->store_conn, (char *) &reply, sizeof(plasma_reply));
  /* Without a valid descriptor the reply is not usable either; fail here in
   * the same way plasma_get does instead of working on garbage. */
  CHECKM(fd != -1, "recv not successful");
  plasma_object *object = &reply.object;
  CHECK(object->data_size == data_size);
  CHECK(object->metadata_size == metadata_size);
  /* The metadata should come right after the data. */
  CHECK(object->metadata_offset == object->data_offset + data_size);
  *data = lookup_or_mmap(conn, fd, object->handle.store_fd,
                         object->handle.mmap_size) +
          object->data_offset;
  /* If plasma_create is being called from a transfer, then we will not copy the
   * metadata here. The metadata will be written along with the data streamed
   * from the transfer. */
  if (metadata != NULL) {
    /* Copy the metadata to the buffer. */
    memcpy(*data + object->data_size, metadata, metadata_size);
  }
  /* Increment the count of the number of instances of this object that this
   * client is using. A call to plasma_release is required to decrement this
   * count. */
  increment_object_count(conn, object_id, object->handle.store_fd);
}
/* This method is used to get both the data and the metadata.
 *
 * conn:          connection to the store.
 * object_id:     ID of the object to get.
 * size:          receives the size in bytes of the data.
 * data:          receives the address of the data.
 * metadata_size: receives the size in bytes of the metadata; may be NULL.
 * metadata:      receives the address of the metadata; may be NULL.
 *
 * A call to plasma_release is required when the client is done with the
 * object. */
void plasma_get(plasma_connection *conn,
                object_id object_id,
                int64_t *size,
                uint8_t **data,
                int64_t *metadata_size,
                uint8_t **metadata) {
  plasma_request req = make_plasma_request(object_id);
  plasma_send_request(conn->store_conn, PLASMA_GET, &req);
  plasma_reply reply;
  int fd = recv_fd(conn->store_conn, (char *) &reply, sizeof(plasma_reply));
  CHECKM(fd != -1, "recv not successful");
  plasma_object *object = &reply.object;
  *data = lookup_or_mmap(conn, fd, object->handle.store_fd,
                         object->handle.mmap_size) +
          object->data_offset;
  *size = object->data_size;
  /* If requested, return the metadata as well. The two outputs are checked
   * independently so that passing only one of them cannot lead to a write
   * through a NULL pointer. */
  if (metadata != NULL) {
    /* The metadata is stored directly behind the data. */
    *metadata = *data + object->data_size;
  }
  if (metadata_size != NULL) {
    *metadata_size = object->metadata_size;
  }
  /* Increment the count of the number of instances of this object that this
   * client is using. A call to plasma_release is required to decrement this
   * count. */
  increment_object_count(conn, object_id, object->handle.store_fd);
}
/**
 * Drop one reference that this client holds on an object.
 *
 * When the last reference goes away, the use count of the memory-mapped file
 * backing the object is decremented as well (unmapping the file once nothing
 * in it is used any more), the store is told that the object is no longer
 * needed, and the bookkeeping entry for the object is freed.
 *
 * @param conn The object containing the connection state.
 * @param object_id The ID of the object that is no longer needed.
 * @return Void.
 */
void plasma_release(plasma_connection *conn, object_id object_id) {
  /* Look up the per-object use count that was incremented when the object
   * was handed to this client. */
  object_in_use_entry *in_use;
  HASH_FIND(hh, conn->objects_in_use, &object_id, sizeof(object_id), in_use);
  CHECK(in_use != NULL);
  in_use->count -= 1;
  CHECK(in_use->count >= 0);

  /* Nothing more to do unless this call released the object entirely. */
  if (in_use->count != 0) {
    return;
  }

  /* One fewer object of this memory-mapped file is in use by the client. */
  client_mmap_table_entry *mapping;
  HASH_FIND_INT(conn->mmap_table, &in_use->fd, mapping);
  CHECK(mapping != NULL);
  mapping->count -= 1;
  CHECK(mapping->count >= 0);
  if (mapping->count == 0) {
    /* No object in this file is used any longer: unmap it and forget it. */
    munmap(mapping->pointer, mapping->length);
    HASH_DELETE(hh, conn->mmap_table, mapping);
    free(mapping);
  }

  /* Let the store know that the client is done with the object. */
  plasma_request release_request = make_plasma_request(object_id);
  plasma_send_request(conn->store_conn, PLASMA_RELEASE, &release_request);

  /* Finally drop the object from the table of objects in use. */
  HASH_DELETE(hh, conn->objects_in_use, in_use);
  free(in_use);
}
/**
 * Ask the plasma store whether it contains an object.
 *
 * Sends a PLASMA_CONTAINS request and reads the reply from the store socket.
 *
 * @param conn The object containing the connection state.
 * @param object_id The ID of the object whose presence is being checked.
 * @param has_object Receives the has_object field of the store's reply.
 * @return Void.
 */
void plasma_contains(plasma_connection *conn,
                     object_id object_id,
                     int *has_object) {
  plasma_request contains_request = make_plasma_request(object_id);
  plasma_send_request(conn->store_conn, PLASMA_CONTAINS, &contains_request);

  plasma_reply store_reply;
  int bytes_read = read(conn->store_conn, &store_reply, sizeof(plasma_reply));
  CHECKM(bytes_read != -1, "read error");
  CHECKM(bytes_read != 0, "connection disconnected");

  *has_object = store_reply.has_object;
}
/**
 * Seal an object. The PLASMA_SEAL request is sent to the store and, when this
 * connection also has a manager connection, to the manager as well.
 *
 * @param conn The object containing the connection state.
 * @param object_id The ID of the object to seal.
 * @return Void.
 */
void plasma_seal(plasma_connection *conn, object_id object_id) {
  plasma_request seal_request = make_plasma_request(object_id);
  plasma_send_request(conn->store_conn, PLASMA_SEAL, &seal_request);
  if (conn->manager_conn < 0) {
    /* No manager is attached to this connection. */
    return;
  }
  plasma_send_request(conn->manager_conn, PLASMA_SEAL, &seal_request);
}
/**
 * Send a PLASMA_DELETE request for an object to the plasma store. No reply is
 * read back.
 *
 * @param conn The object containing the connection state.
 * @param object_id The ID of the object to delete.
 * @return Void.
 */
void plasma_delete(plasma_connection *conn, object_id object_id) {
  plasma_request delete_request = make_plasma_request(object_id);
  plasma_send_request(conn->store_conn, PLASMA_DELETE, &delete_request);
}
/**
 * Subscribe to notifications about sealed objects.
 *
 * A socket pair is created; one end is handed to the plasma store (which
 * writes notifications to it) and the other end is returned to the caller.
 *
 * @param conn The object containing the connection state.
 * @return The file descriptor that the client should use to read
 *         notifications about sealed objects.
 */
int plasma_subscribe(plasma_connection *conn) {
  int fd[2];
  /* Create a socket pair. This will only be used to send notifications from
   * the Plasma store to the client. */
  CHECK(socketpair(AF_UNIX, SOCK_STREAM, 0, fd) == 0);
  /* Make the store's end of the socket pair non-blocking. */
  int flags = fcntl(fd[1], F_GETFL, 0);
  CHECK(flags != -1);
  CHECK(fcntl(fd[1], F_SETFL, flags | O_NONBLOCK) == 0);
  /* Tell the Plasma store about the subscription. */
  plasma_request req = {};
  plasma_send_request(conn->store_conn, PLASMA_SUBSCRIBE, &req);
  /* Send the file descriptor that the Plasma store should use to push
   * notifications about sealed objects to this client. We include a one byte
   * message because otherwise it seems to hang on Linux. */
  char dummy = '\0';
  send_fd(conn->store_conn, fd[1], &dummy, 1);
  /* The store received its own copy of the descriptor with the message, so
   * the client's copy of the write end is no longer needed. Keeping it open
   * would leak one descriptor per subscription. */
  close(fd[1]);
  /* Return the file descriptor that the client should use to read
   * notifications about sealed objects. */
  return fd[0];
}
/**
 * Connect to the local plasma store and, optionally, to a plasma manager.
 *
 * The store connection is attempted up to 50 times with a 100 millisecond
 * pause after each failed attempt; if none succeeds the process exits.
 *
 * @param store_socket_name The name of the UNIX domain socket of the store.
 *        Must not be NULL.
 * @param manager_addr The address of the plasma manager to connect to, or NULL
 *        to skip the manager connection.
 * @param manager_port The port of the plasma manager. Unused when manager_addr
 *        is NULL.
 * @return A newly allocated connection object; release it with
 *         plasma_disconnect.
 */
plasma_connection *plasma_connect(const char *store_socket_name,
                                  const char *manager_addr,
                                  int manager_port) {
  CHECK(store_socket_name);
  /* Try to connect to the Plasma store. If unsuccessful, retry several
   * times. */
  int fd = -1;
  for (int num_attempts = 0; num_attempts < 50; ++num_attempts) {
    fd = connect_ipc_sock(store_socket_name);
    if (fd >= 0) {
      break;
    }
    /* Sleep for 100 milliseconds. */
    usleep(100000);
  }
  /* If we could not connect to the Plasma store, exit. */
  if (fd < 0) {
    LOG_ERR("could not connect to store %s", store_socket_name);
    exit(-1);
  }
  /* Initialize the store connection struct. */
  plasma_connection *result = malloc(sizeof(plasma_connection));
  CHECK(result != NULL);
  result->store_conn = fd;
  if (manager_addr != NULL) {
    result->manager_conn = plasma_manager_connect(manager_addr, manager_port);
  } else {
    /* A negative descriptor marks "no manager connection". */
    result->manager_conn = -1;
  }
  result->mmap_table = NULL;
  result->objects_in_use = NULL;
  return result;
}
/**
 * Close the sockets of a connection and free the connection object.
 *
 * NOTE(review): entries still present in conn->mmap_table and
 * conn->objects_in_use are not released here -- confirm that callers release
 * every object before disconnecting.
 *
 * @param conn The connection to the local plasma store and plasma manager.
 * @return Void.
 */
void plasma_disconnect(plasma_connection *conn) {
  const int manager_fd = conn->manager_conn;
  close(conn->store_conn);
  /* A negative manager descriptor means no manager was connected. */
  if (manager_fd >= 0) {
    close(manager_fd);
  }
  free(conn);
}
#define h_addr h_addr_list[0]
/**
 * Open a TCP connection to a plasma manager.
 *
 * Any failure (socket creation, name resolution, connect) is logged and
 * terminates the process.
 *
 * @param ip_addr The host name or IP address of the plasma manager.
 * @param port The port of the plasma manager.
 * @return The file descriptor of the connected socket.
 */
/* TODO(swang): Return the error to the caller. */
int plasma_manager_connect(const char *ip_addr, int port) {
  int fd = socket(PF_INET, SOCK_STREAM, 0);
  if (fd < 0) {
    LOG_ERR("could not create socket");
    exit(-1);
  }
  struct hostent *manager = gethostbyname(ip_addr); /* TODO(pcm): cache this */
  if (!manager) {
    LOG_ERR("plasma manager %s not found", ip_addr);
    exit(-1);
  }
  struct sockaddr_in addr;
  /* Zero the whole structure so that padding and platform-specific fields do
   * not carry stack garbage into connect(). */
  memset(&addr, 0, sizeof(addr));
  addr.sin_family = AF_INET;
  /* Only copy the resolved address if it fits into the IPv4 address field. */
  if (manager->h_length < 0 ||
      (size_t) manager->h_length > sizeof(addr.sin_addr.s_addr)) {
    LOG_ERR("plasma manager %s not found", ip_addr);
    exit(-1);
  }
  memcpy(&addr.sin_addr.s_addr, manager->h_addr, manager->h_length);
  addr.sin_port = htons(port);
  int r = connect(fd, (struct sockaddr *) &addr, sizeof(addr));
  if (r < 0) {
    LOG_ERR(
        "could not establish connection to manager with id %s:%d (probably ran "
        "out of ports)",
        ip_addr, port);
    exit(-1);
  }
  return fd;
}
/**
 * Ask the plasma manager to transfer an object to another manager.
 *
 * @param conn The object containing the connection state. It must have a
 *        manager connection.
 * @param addr The destination address as a dotted-quad IPv4 string, for
 *        example "127.0.0.1".
 * @param port The destination port.
 * @param object_id The ID of the object to transfer.
 * @return Void.
 */
void plasma_transfer(plasma_connection *conn,
                     const char *addr,
                     int port,
                     object_id object_id) {
  CHECK(conn->manager_conn >= 0);
  plasma_request req = make_plasma_request(object_id);
  req.port = port;
  /* Parse the four components of the address, making sure that each one is
   * a number in [0, 255] and that the components are separated by dots, so
   * that a malformed string can never be read past its end. */
  const char *cursor = addr;
  for (int i = 0; i < 4; ++i) {
    char *end = NULL;
    long component = strtol(cursor, &end, 10);
    CHECKM(end != cursor, "invalid IP address");
    CHECKM(component >= 0 && component <= 255, "invalid IP address");
    req.addr[i] = component;
    if (i < 3) {
      CHECKM(*end == '.', "invalid IP address");
      /* Skip the '.'. */
      cursor = end + 1;
    }
  }
  plasma_send_request(conn->manager_conn, PLASMA_TRANSFER, &req);
}
/**
 * Ask the local plasma manager to fetch a set of objects and wait for one
 * reply per object.
 *
 * @param conn The object containing the connection state. It must have a
 *        manager connection.
 * @param num_object_ids The number of object IDs requested.
 * @param object_ids The object IDs requested. Length must be at least
 *        num_object_ids.
 * @param is_fetched Receives, for each entry of object_ids, whether the
 *        manager reported the object as fetched. Length must be at least
 *        num_object_ids. Every entry is written: objects for which no reply
 *        could be read are reported as not fetched.
 * @return Void.
 */
void plasma_fetch(plasma_connection *conn,
                  int num_object_ids,
                  object_id object_ids[],
                  int is_fetched[]) {
  CHECK(conn->manager_conn >= 0);
  /* Start from "nothing fetched" so that the result is fully defined even if
   * the caller did not clear the array or the manager goes away early. */
  for (int i = 0; i < num_object_ids; ++i) {
    is_fetched[i] = 0;
  }
  plasma_request *req =
      make_plasma_multiple_request(num_object_ids, object_ids);
  LOG_DEBUG("Requesting fetch");
  plasma_send_request(conn->manager_conn, PLASMA_FETCH, req);
  free(req);
  plasma_reply reply;
  for (int received = 0; received < num_object_ids; ++received) {
    int nbytes = recv(conn->manager_conn, (uint8_t *) &reply, sizeof(reply),
                      MSG_WAITALL);
    if (nbytes < 0) {
      /* The reply buffer holds no valid object ID, so there is nothing to
       * match. The remaining objects stay marked as not fetched. */
      LOG_ERR("Error while waiting for manager response in fetch");
      return;
    }
    if (nbytes == 0) {
      /* The manager closed the connection; no further replies will come. */
      return;
    }
    CHECK(nbytes == sizeof(reply));
    /* Update the correct index in is_fetched. */
    int i = 0;
    for (; i < num_object_ids; i++) {
      if (memcmp(&object_ids[i], &reply.object_id, sizeof(object_id)) == 0) {
        /* Check that this isn't a duplicate response. */
        CHECK(!is_fetched[i]);
        is_fetched[i] = reply.has_object;
        break;
      }
    }
    CHECKM(i != num_object_ids,
           "Received unexpected object ID from manager during fetch.");
  }
}
+198
View File
@@ -0,0 +1,198 @@
#ifndef PLASMA_CLIENT_H
#define PLASMA_CLIENT_H
#include "plasma.h"
typedef struct plasma_connection plasma_connection;
/**
* This is used by the Plasma Client to send a request to the Plasma Store or
* the Plasma Manager.
*
* @param fd The file descriptor to use to send the request.
* @param type The type of request.
* @param req The address of the request to send.
* @return Void.
*/
void plasma_send_request(int fd, int type, plasma_request *req);
/**
* Create a plasma request to be sent with a single object ID.
*
* @param object_id The object ID to include in the request.
* @return The plasma request.
*/
plasma_request make_plasma_request(object_id object_id);
/**
* Create a plasma request to be sent with multiple object IDs. The caller must
* free the returned plasma request pointer.
*
* @param num_object_ids The number of object IDs to include in the request.
* @param object_ids The array of object IDs to include in the request. It must
* have length at least equal to num_object_ids.
* @return A pointer to the newly created plasma request.
*/
plasma_request *make_plasma_multiple_request(int num_object_ids,
object_id object_ids[]);
/**
* Connect to the local plasma store and plasma manager. Return
* the resulting connection.
*
* @param store_socket_name The name of the UNIX domain socket to use to
* connect to the Plasma Store.
* @param manager_addr The IP address of the plasma manager to connect to. If
* this is NULL, then this function will not connect to a manager.
* @param manager_port The port of the plasma manager to connect to. If
* manager_addr is NULL, then this argument is unused.
* @return The object containing the connection state.
*/
plasma_connection *plasma_connect(const char *store_socket_name,
const char *manager_addr,
int manager_port);
/**
* Disconnect from the local plasma instance, including the local store and
* manager.
*
* @param conn The connection to the local plasma store and plasma manager.
* @return Void.
*/
void plasma_disconnect(plasma_connection *conn);
/**
* Connect to a possibly remote Plasma Manager.
*
* @param addr The IP address of the Plasma Manager to connect to.
* @param port The port of the Plasma Manager to connect to.
* @return The file descriptor to use to send messages to the Plasma Manager.
*/
int plasma_manager_connect(const char *addr, int port);
/**
* Create an object in the Plasma Store. Any metadata for this object must be
* passed in when the object is created.
*
* @param conn The object containing the connection state.
* @param object_id The ID to use for the newly created object.
* @param size The size in bytes of the space to be allocated for this object's
data (this does not include space used for metadata).
* @param metadata The object's metadata. If there is no metadata, this pointer
should be NULL.
* @param metadata_size The size in bytes of the metadata. If there is no
metadata, this should be 0.
* @param data The address of the newly created object will be written here.
* @return Void.
*/
void plasma_create(plasma_connection *conn,
object_id object_id,
int64_t size,
uint8_t *metadata,
int64_t metadata_size,
uint8_t **data);
/**
* Get an object from the Plasma Store. This function will block until the
* object has been created and sealed in the Plasma Store.
*
* @param conn The object containing the connection state.
* @param object_id The ID of the object to get.
* @param size The size in bytes of the retrieved object will be written at this
address.
* @param data The address of the object will be written at this address.
* @param metadata_size The size in bytes of the object's metadata will be
* written at this address.
* @param metadata The address of the object's metadata will be written at this
* address.
* @return Void.
*/
void plasma_get(plasma_connection *conn,
object_id object_id,
int64_t *size,
uint8_t **data,
int64_t *metadata_size,
uint8_t **metadata);
/**
* Tell Plasma that the client no longer needs the object. This should be called
* after plasma_get when the client is done with the object. After this call,
* the address returned by plasma_get is no longer valid. This should be called
* once for each call to plasma_get (with the same object ID).
*
* @param conn The object containing the connection state.
* @param object_id The ID of the object that is no longer needed.
* @return Void.
*/
void plasma_release(plasma_connection *conn, object_id object_id);
/**
* Check if the object store contains a particular object and the object has
* been sealed. The result will be stored in has_object.
*
* @todo: We may want to indicate if the object has been created but not sealed.
*
* @param conn The object containing the connection state.
* @param object_id The ID of the object whose presence we are checking.
* @param has_object The function will write 1 at this address if the object is
* present and 0 if it is not present.
* @return Void.
*/
void plasma_contains(plasma_connection *conn,
object_id object_id,
int *has_object);
/**
* Seal an object in the object store. The object will be immutable after this
* call.
*
* @param conn The object containing the connection state.
* @param object_id The ID of the object to seal.
* @return Void.
*/
void plasma_seal(plasma_connection *conn, object_id object_id);
/**
* Delete an object from the object store. This currently assumes that the
* object is present and has been sealed.
*
* @todo We may want to allow the deletion of objects that are not present or
* haven't been sealed.
*
* @param conn The object containing the connection state.
* @param object_id The ID of the object to delete.
* @return Void.
*/
void plasma_delete(plasma_connection *conn, object_id object_id);
/**
* Fetch objects from remote plasma stores that have the
* objects stored.
*
* @param conn The object containing the connection state. It must include a
* connection to the local manager.
* @param num_object_ids The number of object IDs requested.
* @param object_ids[] The vector of object IDs requested. Length must be at
* least num_object_ids.
* @param is_fetched[] The vector in which to return the success
* of each object's fetch operation, in the same order as
* object_ids. Length must be at least num_object_ids.
* @return Void.
*/
void plasma_fetch(plasma_connection *conn,
int num_object_ids,
object_id object_ids[],
int is_fetched[]);
/**
* Subscribe to notifications when objects are sealed in the object store.
* Whenever an object is sealed, a message will be written to the client socket
* that is returned by this method.
*
* @param conn The object containing the connection state.
* @return The file descriptor that the client should use to read notifications
from the object store about sealed objects.
*/
int plasma_subscribe(plasma_connection *conn);
#endif
+804
View File
@@ -0,0 +1,804 @@
/* PLASMA MANAGER: Local to a node, connects to other managers to send and
* receive objects from them
*
* The storage manager listens on its main listening port, and if a request for
* transferring an object to another object store comes in, it ships the data
* using a new connection to the target object manager. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <strings.h>
#include <poll.h>
#include <assert.h>
#include <netinet/in.h>
#include <netdb.h>
#include "uthash.h"
#include "utlist.h"
#include "utarray.h"
#include "utstring.h"
#include "common.h"
#include "io.h"
#include "event_loop.h"
#include "plasma.h"
#include "plasma_client.h"
#include "plasma_manager.h"
#include "state/db.h"
#include "state/object_table.h"
#define NUM_RETRIES 5
/* Timeouts are in milliseconds. */
#ifndef RAY_TIMEOUT
#define MANAGER_TIMEOUT 1000
#else
#define MANAGER_TIMEOUT RAY_TIMEOUT
#endif
typedef struct client_object_connection client_object_connection;
/** State of a plasma manager, created by init_plasma_manager_state. */
typedef struct {
/** Event loop. */
event_loop *loop;
/** Connection to the local plasma store for reading or writing data. */
plasma_connection *plasma_conn;
/** Hash table of all contexts for active connections to
* other plasma managers. These are used for writing data to
* other plasma stores. Entries are keyed by the string
* <address>:<port> of the remote manager. */
client_connection *manager_connections;
/** Handle for the connection to the state database, or NULL if no database
* address was given when the state was initialized. */
db_handle *db;
/** Our address, one byte per component of the dotted-quad IPv4 address. */
uint8_t addr[4];
/** Our port. */
int port;
/** Hash table of outstanding fetch requests. The key is
* object id, value is a list of connections to the clients
* who are blocking on a fetch of this object. */
client_object_connection *fetch_connections;
} plasma_manager_state;
plasma_manager_state *g_manager_state = NULL;
typedef struct plasma_request_buffer plasma_request_buffer;
/* Buffer for requests between plasma managers. Buffers are chained into the
* transfer_queue of a client_connection. */
struct plasma_request_buffer {
/* The type of the buffered request. The queue writer handles
* PLASMA_TRANSFER and PLASMA_DATA. */
int type;
/* The ID of the object that this request is about. */
object_id object_id;
/* Pointer to the object's data. For data transfers this is treated as a
* pointer to the concatenated data and metadata. */
uint8_t *data;
/* Size in bytes of the object's data, not counting the metadata. */
int64_t data_size;
/* Pointer to the object's metadata.
* NOTE(review): presumably data + data_size when set -- confirm against the
* code that fills in this field. */
uint8_t *metadata;
/* Size in bytes of the object's metadata. */
int64_t metadata_size;
/* Pointer to the next buffer that we will write to this plasma manager. This
* field is only used if we're pushing requests to another plasma manager,
* not if we are receiving data. */
plasma_request_buffer *next;
};
/* The context for fetch and wait requests. These are per client, per object.
* A context is registered both with its client connection (through active_hh)
* and with the manager state (through fetch_hh and next). */
struct client_object_connection {
/** The ID of the object we are fetching or waiting for. */
object_id object_id;
/** The client connection context, shared between other
* client_object_connections for the same client. */
client_connection *client_conn;
/** The ID for the timer that will time out the current request to the state
* database or another plasma manager. */
int64_t timer;
/** How many retries we have left for the request. Decremented on every
* timeout. */
int num_retries;
/** Handle for a linked list: the next context that is waiting for the same
* object ID. Only the first context of such a list is stored in the
* manager state's fetch_connections table. */
client_object_connection *next;
/** Pointer to the array containing the manager locations of
* this object. Each entry is a string of the form <address>:<port>. */
char **manager_vector;
/** The number of manager locations in the array manager_vector. */
int manager_count;
/** Handle for the uthash table in the client connection
* context that keeps track of active object connection
* contexts. */
UT_hash_handle active_hh;
/** Handle for the uthash table in the manager state that
* keeps track of outstanding fetch requests. */
UT_hash_handle fetch_hh;
};
/* Context for a client connection to another plasma manager. The same
* structure also carries the per-client state used for answering fetch
* requests (active_objects and num_return_objects). */
struct client_connection {
/** Current state for this plasma manager. This is shared
* between all client connections to the plasma manager. */
plasma_manager_state *manager_state;
/** Current position in the buffer, counted in bytes from the start of the
* object's data. It is zero when no transfer is in progress. */
int64_t cursor;
/** Buffer that this connection is reading from. If this is a connection to
* write data to another plasma store, then it is a linked
* list of buffers to write. */
/* TODO(swang): Split into two queues, data transfers and data requests. */
plasma_request_buffer *transfer_queue;
/** File descriptor for the socket connected to the other
* plasma manager. */
int fd;
/** The objects that we are waiting for and their callback
* contexts, for either a fetch or a wait operation. */
client_object_connection *active_objects;
/** The number of objects that we have left to return for
* this fetch or wait operation. Decremented for every reply that is sent. */
int num_return_objects;
/** Fields specific to connections to plasma managers. Key that uniquely
* identifies the plasma manager that we're connected to. We will use the
* string <address>:<port> as an identifier. */
char *ip_addr_port;
/** Handle for the uthash table. */
UT_hash_handle hh;
};
/**
 * Free an object connection context together with the manager location
 * strings and the location array that it owns.
 *
 * @param object_conn The context to free.
 * @return Void.
 */
void free_client_object_connection(client_object_connection *object_conn) {
  char **locations = object_conn->manager_vector;
  const int num_locations = object_conn->manager_count;
  for (int idx = 0; idx < num_locations; ++idx) {
    free(locations[idx]);
  }
  free(locations);
  free(object_conn);
}
/**
 * Send one reply to a client and count it against the number of replies the
 * client is still owed.
 *
 * @param conn The client connection context.
 * @param reply The reply to write to the client's socket.
 * @return 0 if the whole reply was written, 1 otherwise.
 */
int send_client_reply(client_connection *conn, plasma_reply *reply) {
  conn->num_return_objects -= 1;
  CHECK(conn->num_return_objects >= 0);
  /* TODO(swang): Handle errors in write. */
  int bytes_written = write(conn->fd, (uint8_t *) reply, sizeof(plasma_reply));
  if (bytes_written != sizeof(plasma_reply)) {
    return 1;
  }
  return 0;
}
/**
 * Look up the active context for an object ID on a client connection.
 *
 * @param client_conn The client connection context.
 * @param object_id The object ID whose context we want.
 * @return A pointer to the active object context, or NULL if the client has
 *         no active context for this object ID.
 */
client_object_connection *get_object_connection(client_connection *client_conn,
                                                object_id object_id) {
  client_object_connection *found;
  HASH_FIND(active_hh, client_conn->active_objects, &object_id,
            sizeof(object_id), found);
  return found;
}
/**
 * Allocate a context for an object that a client is waiting for and register
 * it in two places: the client connection's table of active objects and the
 * manager state's table of outstanding fetch requests, where all contexts
 * for the same object ID are chained into one list.
 *
 * @param client_conn The client connection context.
 * @param object_id The object ID whose context we want to create.
 * @return A pointer to the newly created object context, or NULL if it could
 *         not be allocated.
 */
client_object_connection *add_object_connection(client_connection *client_conn,
                                                object_id object_id) {
  /* TODO(swang): Support registration of wait operations. */
  client_object_connection *new_conn =
      malloc(sizeof(client_object_connection));
  if (new_conn == NULL) {
    return NULL;
  }
  new_conn->object_id = object_id;
  new_conn->client_conn = client_conn;
  new_conn->manager_count = 0;
  new_conn->manager_vector = NULL;

  /* Make the context reachable from the client connection. */
  HASH_ADD(active_hh, client_conn->active_objects, object_id, sizeof(object_id),
           new_conn);

  /* Find the list of contexts that already wait for this object, if any. */
  client_object_connection *waiters;
  HASH_FIND(fetch_hh, client_conn->manager_state->fetch_connections, &object_id,
            sizeof(object_id), waiters);
  LOG_DEBUG("Registering fd %d for fetch.", client_conn->fd);

  /* Append to the list. Only the head of a list lives in the manager's hash
   * table, so the table is touched only when a new list is started. */
  const int starts_new_list = (waiters == NULL);
  LL_APPEND(waiters, new_conn);
  if (starts_new_list) {
    HASH_ADD(fetch_hh, client_conn->manager_state->fetch_connections, object_id,
             sizeof(object_id), waiters);
  }
  return new_conn;
}
/**
 * Clean up and free an active object context. Deregister it from the
 * associated client connection and from the manager state.
 *
 * @param client_conn The client connection context.
 * @param object_conn The object context that we want to delete. It is freed
 *        by this call and must not be used afterwards.
 */
void remove_object_connection(client_connection *client_conn,
                              client_object_connection *object_conn) {
  /* Deregister the object context with the client context. */
  HASH_DELETE(active_hh, client_conn->active_objects, object_conn);
  /* Deregister the object context with the manager state. The manager's hash
   * table only holds the head of the list of contexts for this object ID. */
  client_object_connection *object_conns;
  HASH_FIND(fetch_hh, client_conn->manager_state->fetch_connections,
            &(object_conn->object_id), sizeof(object_conn->object_id),
            object_conns);
  CHECK(object_conns);
  if (object_conns == object_conn) {
    /* We are removing the head of the list, which is the element stored in
     * the hash table. Take it out of the table before it is freed, and if
     * other contexts remain, register the new head in its place so that the
     * table never refers to freed memory. */
    HASH_DELETE(fetch_hh, client_conn->manager_state->fetch_connections,
                object_conns);
    LL_DELETE(object_conns, object_conn);
    if (object_conns != NULL) {
      HASH_ADD(fetch_hh, client_conn->manager_state->fetch_connections,
               object_id, sizeof(object_conns->object_id), object_conns);
    }
  } else {
    /* The head stays the same, so only the list needs to be updated. */
    LL_DELETE(object_conns, object_conn);
  }
  /* Free the object. */
  free_client_object_connection(object_conn);
}
/* Split a string of the form <IP address>:<port> into its two parts.
 *
 * The address part may contain up to 15 digits and dots and is copied into
 * ip_addr, which the caller must have allocated with room for at least 16
 * bytes. The port part may contain up to 5 digits and is stored in *port. The
 * check fails if the string does not have this form. */
/* TODO(swang): Move this function to Ray common. */
void parse_ip_addr_port(const char *ip_addr_port, char *ip_addr, int *port) {
  char port_digits[6];
  int fields_read =
      sscanf(ip_addr_port, "%15[0-9.]:%5[0-9]", ip_addr, port_digits);
  CHECK(fields_read == 2);
  /* port_digits holds between one and five decimal digits here. */
  *port = (int) strtol(port_digits, NULL, 10);
}
/**
 * Create the state of a plasma manager: the event loop, the connection to
 * the local plasma store and, if requested, the connection to the state
 * database.
 *
 * @param store_socket_name The name of the socket of the local plasma store.
 * @param manager_addr Our own address as a dotted-quad IPv4 string.
 * @param manager_port Our own port.
 * @param db_addr The address of the state database, or NULL to run without
 *        a database connection.
 * @param db_port The port of the state database. Unused if db_addr is NULL.
 * @return The newly allocated manager state.
 */
plasma_manager_state *init_plasma_manager_state(const char *store_socket_name,
                                                const char *manager_addr,
                                                int manager_port,
                                                const char *db_addr,
                                                int db_port) {
  /* Zero-initialize the state so that no field is ever read uninitialized. */
  plasma_manager_state *state = calloc(1, sizeof(plasma_manager_state));
  CHECK(state != NULL);
  state->loop = event_loop_create();
  state->plasma_conn = plasma_connect(store_socket_name, NULL, 0);
  state->manager_connections = NULL;
  state->fetch_connections = NULL;
  if (db_addr) {
    state->db = db_connect(db_addr, db_port, "plasma_manager", manager_addr,
                           manager_port);
    db_attach(state->db, state->loop);
    LOG_DEBUG("Connected to db at %s:%d, assigned client ID %d", db_addr,
              db_port, get_client_id(state->db));
  } else {
    state->db = NULL;
    LOG_DEBUG("No db connection specified");
  }
  /* Parse our own address into a scratch array first, so that a malformed
   * address leaves state->addr as all zeros instead of partially filled. */
  uint8_t parsed_addr[4] = {0, 0, 0, 0};
  int parsed = sscanf(manager_addr, "%hhu.%hhu.%hhu.%hhu", &parsed_addr[0],
                      &parsed_addr[1], &parsed_addr[2], &parsed_addr[3]);
  if (parsed == 4) {
    for (int i = 0; i < 4; ++i) {
      state->addr[i] = parsed_addr[i];
    }
  } else {
    LOG_ERR("could not parse manager address %s", manager_addr);
  }
  state->port = manager_port;
  return state;
}
/* Handle a command request that came in through a socket (transferring data,
* or accepting incoming data). */
void process_message(event_loop *loop,
int client_sock,
void *context,
int events);
/**
 * Write the next chunk (at most BUFSIZE bytes) of an object to another
 * plasma manager.
 *
 * conn->cursor tracks how many bytes of the concatenated data and metadata
 * have been written so far. Once everything has been written, the next call
 * performs a zero-length write; that is the signal that the transfer is
 * finished, at which point the cursor is reset to zero and the object is
 * released. A failed write terminates the process.
 *
 * @param conn The connection to the plasma manager that receives the object.
 * @param buf The buffer describing the object that is being sent.
 * @return Void.
 */
void write_object_chunk(client_connection *conn, plasma_request_buffer *buf) {
  ssize_t r, s;
  /* Try to write one BUFSIZE at a time. */
  s = buf->data_size + buf->metadata_size - conn->cursor;
  if (s > BUFSIZE) {
    s = BUFSIZE;
  }
  r = write(conn->fd, buf->data + conn->cursor, s);
  if (r < 0 || (r == 0 && s > 0)) {
    LOG_ERR("write error");
    exit(-1);
  }
  if (r != s) {
    LOG_ERR("partial write on fd %d", conn->fd);
  }
  /* Advance by what was actually written. This also covers partial writes:
   * the bytes that went out must not be sent a second time, and the cursor
   * must not stay at zero, which would be mistaken for a finished transfer. */
  conn->cursor += r;
  if (r == 0) {
    /* If we've finished writing this buffer, reset the cursor to zero. */
    LOG_DEBUG("writing on channel %d finished", conn->fd);
    conn->cursor = 0;
    /* We are done sending the object, so release it. The corresponding call to
     * plasma_get occurred in process_transfer_request. */
    plasma_release(conn->manager_state->plasma_conn, buf->object_id);
  }
}
/**
 * Event loop callback that makes progress on the request at the front of a
 * manager connection's transfer queue.
 *
 * A PLASMA_TRANSFER request is sent in one go. A PLASMA_DATA request is sent
 * as a header followed by the object's bytes, one chunk per invocation. The
 * request is removed from the queue once the connection's cursor is back at
 * zero. If the queue is empty, the connection is removed from the event loop.
 *
 * @param loop The event loop.
 * @param data_sock The socket that became writable (unused; the descriptor
 *        stored in the connection context is used instead).
 * @param context The client_connection for the other plasma manager.
 * @param events The events that fired (unused).
 * @return Void.
 */
void send_queued_request(event_loop *loop,
                         int data_sock,
                         void *context,
                         int events) {
  client_connection *conn = (client_connection *) context;
  plasma_request_buffer *pending = conn->transfer_queue;
  if (pending == NULL) {
    /* If there are no objects to transfer, temporarily remove this connection
     * from the event loop. It will be reawoken when we receive another
     * PLASMA_TRANSFER request. */
    event_loop_remove_file(loop, conn->fd);
    return;
  }

  plasma_request manager_req = make_plasma_request(pending->object_id);
  if (pending->type == PLASMA_TRANSFER) {
    LOG_DEBUG("Requesting transfer on DB client %d",
              get_client_id(conn->manager_state->db));
    /* Tell the other manager where the object should be sent: to us. */
    memcpy(manager_req.addr, conn->manager_state->addr,
           sizeof(manager_req.addr));
    manager_req.port = conn->manager_state->port;
    plasma_send_request(conn->fd, pending->type, &manager_req);
  } else if (pending->type == PLASMA_DATA) {
    LOG_DEBUG("Transferring object to manager");
    if (conn->cursor == 0) {
      /* Nothing has been sent for this object yet, so announce it with the
       * initial PLASMA_DATA request before writing any of its bytes. */
      manager_req.data_size = pending->data_size;
      manager_req.metadata_size = pending->metadata_size;
      plasma_send_request(conn->fd, PLASMA_DATA, &manager_req);
    }
    write_object_chunk(conn, pending);
  } else {
    LOG_ERR("Buffered request has unknown type.");
  }

  /* A non-zero cursor means that the object is only partially written. */
  if (conn->cursor != 0) {
    return;
  }
  /* We are done sending this request. */
  LL_DELETE(conn->transfer_queue, pending);
  free(pending);
}
/**
* Event loop callback that reads the next chunk of an incoming object from
* another plasma manager into the buffer at the front of the connection's
* transfer queue.
*
* Once all of the data and metadata has been read, the object is sealed and
* released, every client that is waiting on a fetch of this object is sent a
* reply with has_object set to 1, and the socket goes back to being handled
* by process_message.
*
* @param loop The event loop.
* @param data_sock The socket to read the object data from.
* @param context The client_connection that the data arrives on.
* @param events The events that fired (unused).
* @return Void.
*/
void process_data_chunk(event_loop *loop,
int data_sock,
void *context,
int events) {
LOG_DEBUG("Reading data");
ssize_t r, s;
client_connection *conn = (client_connection *) context;
plasma_request_buffer *buf = conn->transfer_queue;
CHECK(buf != NULL);
/* Try to read one BUFSIZE at a time. */
s = buf->data_size + buf->metadata_size - conn->cursor;
if (s > BUFSIZE) {
s = BUFSIZE;
}
r = read(data_sock, buf->data + conn->cursor, s);
/* NOTE(review): a read error or end of file is only logged. If the object is
* still incomplete, the function returns below and leaves the socket
* registered for this callback -- confirm that this is intended. */
if (r == -1) {
LOG_ERR("read error");
} else if (r == 0) {
LOG_DEBUG("end of file");
} else {
conn->cursor += r;
}
if (conn->cursor != buf->data_size + buf->metadata_size) {
/* If we haven't finished reading all the data for this object yet, we're
* done for now. */
return;
}
/* Seal the object and release it. The release corresponds to the call to
* plasma_create that occurred in process_data_request. */
LOG_DEBUG("reading on channel %d finished", data_sock);
plasma_seal(conn->manager_state->plasma_conn, buf->object_id);
plasma_release(conn->manager_state->plasma_conn, buf->object_id);
/* Notify any clients who were waiting on a fetch to this object. */
client_object_connection *object_conn, *next;
client_connection *client_conn;
HASH_FIND(fetch_hh, conn->manager_state->fetch_connections, &(buf->object_id),
sizeof(buf->object_id), object_conn);
plasma_reply reply = {.object_id = buf->object_id, .has_object = 1};
while (object_conn) {
/* Remember the successor first: remove_object_connection frees
* object_conn. */
next = object_conn->next;
client_conn = object_conn->client_conn;
send_client_reply(client_conn, &reply);
/* The fetch has been answered, so its timeout timer is cancelled. */
event_loop_remove_timer(client_conn->manager_state->loop,
object_conn->timer);
remove_object_connection(client_conn, object_conn);
object_conn = next;
}
/* Remove the request buffer used for reading this object's data. */
LL_DELETE(conn->transfer_queue, buf);
free(buf);
/* Switch to listening for requests from this socket, instead of reading
* object data. */
event_loop_remove_file(loop, data_sock);
event_loop_add_file(loop, data_sock, EVENT_LOOP_READ, process_message, conn);
}
/**
 * Get the connection context for another plasma manager, opening a new
 * connection if we do not have one for it yet. Connections are cached in the
 * manager state under the key <address>:<port>.
 *
 * @param state The state of this plasma manager.
 * @param ip_addr The address of the other plasma manager.
 * @param port The port of the other plasma manager.
 * @return The connection context for the other plasma manager.
 */
client_connection *get_manager_connection(plasma_manager_state *state,
                                          const char *ip_addr,
                                          int port) {
  /* TODO(swang): Should probably check whether ip_addr and port belong to us.
   */
  UT_string *ip_addr_port;
  utstring_new(ip_addr_port);
  utstring_printf(ip_addr_port, "%s:%d", ip_addr, port);
  client_connection *manager_conn;
  HASH_FIND_STR(state->manager_connections, utstring_body(ip_addr_port),
                manager_conn);
  LOG_DEBUG("Getting manager connection to %s on DB client %d",
            utstring_body(ip_addr_port), get_client_id(state->db));
  if (!manager_conn) {
    /* If we don't already have a connection to this manager, start one. The
     * context is zero-initialized so that every field, including the ones
     * that are only used for client requests, starts out in a defined
     * state. */
    manager_conn = calloc(1, sizeof(client_connection));
    CHECK(manager_conn != NULL);
    manager_conn->fd = plasma_manager_connect(ip_addr, port);
    manager_conn->manager_state = state;
    manager_conn->transfer_queue = NULL;
    manager_conn->cursor = 0;
    manager_conn->active_objects = NULL;
    manager_conn->num_return_objects = 0;
    manager_conn->ip_addr_port = strdup(utstring_body(ip_addr_port));
    CHECK(manager_conn->ip_addr_port != NULL);
    HASH_ADD_KEYPTR(hh, manager_conn->manager_state->manager_connections,
                    manager_conn->ip_addr_port,
                    strlen(manager_conn->ip_addr_port), manager_conn);
  }
  utstring_free(ip_addr_port);
  return manager_conn;
}
/**
 * Handle a request to transfer a local object to another plasma manager: get
 * the object from the local store and queue it for sending on the connection
 * to the destination manager.
 *
 * @param loop The event loop.
 * @param object_id The ID of the object to transfer.
 * @param addr The IPv4 address of the destination manager, one byte per
 *        component.
 * @param port The port of the destination manager.
 * @param conn The connection that the request arrived on.
 * @return Void.
 */
void process_transfer_request(event_loop *loop,
                              object_id object_id,
                              uint8_t addr[4],
                              int port,
                              client_connection *conn) {
  uint8_t *data;
  int64_t data_size;
  uint8_t *metadata;
  int64_t metadata_size;
  /* TODO(swang): A non-blocking plasma_get, or else we could block here
   * forever if we don't end up sealing this object. */
  /* The corresponding call to plasma_release will happen in
   * write_object_chunk. */
  plasma_get(conn->manager_state->plasma_conn, object_id, &data_size, &data,
             &metadata_size, &metadata);
  assert(metadata == data + data_size);
  /* Zero-initialize the buffer so that no field is left undefined. */
  plasma_request_buffer *buf = calloc(1, sizeof(plasma_request_buffer));
  CHECK(buf != NULL);
  buf->type = PLASMA_DATA;
  buf->object_id = object_id;
  buf->data = data; /* We treat this as a pointer to the
                       concatenated data and metadata. */
  buf->data_size = data_size;
  buf->metadata = metadata;
  buf->metadata_size = metadata_size;
  buf->next = NULL;

  UT_string *ip_addr;
  utstring_new(ip_addr);
  utstring_printf(ip_addr, "%d.%d.%d.%d", addr[0], addr[1], addr[2], addr[3]);
  client_connection *manager_conn =
      get_manager_connection(conn->manager_state, utstring_body(ip_addr), port);
  utstring_free(ip_addr);

  if (manager_conn->transfer_queue == NULL) {
    /* If we already have a connection to this manager and it's inactive,
     * (re)register it with the event loop again. */
    event_loop_add_file(loop, manager_conn->fd, EVENT_LOOP_WRITE,
                        send_queued_request, manager_conn);
  }
  /* Add this transfer request to this connection's transfer queue. */
  LL_APPEND(manager_conn->transfer_queue, buf);
}
/**
 * Handle the announcement of an incoming object from another plasma manager:
 * create the object in the local store and switch the socket over to reading
 * the object's bytes.
 *
 * @param loop The event loop.
 * @param client_sock The socket that the object data will arrive on.
 * @param object_id The ID of the incoming object.
 * @param data_size The size in bytes of the object's data.
 * @param metadata_size The size in bytes of the object's metadata.
 * @param conn The connection that the request arrived on.
 * @return Void.
 */
void process_data_request(event_loop *loop,
                          int client_sock,
                          object_id object_id,
                          int64_t data_size,
                          int64_t metadata_size,
                          client_connection *conn) {
  /* Zero-initialize the buffer so that no field is left undefined. */
  plasma_request_buffer *buf = calloc(1, sizeof(plasma_request_buffer));
  CHECK(buf != NULL);
  buf->type = PLASMA_DATA;
  buf->object_id = object_id;
  buf->data_size = data_size;
  buf->metadata_size = metadata_size;
  buf->next = NULL;
  /* The corresponding call to plasma_release should happen in
   * process_data_chunk. The metadata is not passed here; it is streamed in
   * along with the data. */
  plasma_create(conn->manager_state->plasma_conn, object_id, data_size, NULL,
                metadata_size, &(buf->data));
  /* The metadata is stored right after the data. */
  buf->metadata = buf->data + data_size;
  LL_APPEND(conn->transfer_queue, buf);
  conn->cursor = 0;
  /* Switch to reading the data from this socket, instead of listening for
   * other requests. */
  event_loop_remove_file(loop, client_sock);
  event_loop_add_file(loop, client_sock, EVENT_LOOP_READ, process_data_chunk,
                      conn);
}
/**
 * Request a transfer for the given object ID from the next manager believed to
 * have a copy. Adds the request for this object ID to the queue of outgoing
 * requests to the manager we want to try.
 *
 * The manager is chosen from the object's list of locations based on the
 * number of retries that are left, so that successive attempts go to
 * different managers.
 *
 * @param client_conn The context for the connection to this client.
 * @param object_id The object ID we want to request a transfer of.
 * @returns Void.
 */
void request_transfer_from(client_connection *client_conn,
                           object_id object_id) {
  client_object_connection *object_conn =
      get_object_connection(client_conn, object_id);
  CHECK(object_conn);
  CHECK(object_conn->manager_count > 0);
  /* Room for a dotted-quad IPv4 address and its terminating null byte. */
  char addr[16];
  int port;
  int i = object_conn->num_retries % object_conn->manager_count;
  parse_ip_addr_port(object_conn->manager_vector[i], addr, &port);

  client_connection *manager_conn =
      get_manager_connection(client_conn->manager_state, addr, port);
  /* Zero-initialize the request so that the data fields, which are not used
   * for transfer requests, are not left undefined. */
  plasma_request_buffer *transfer_request =
      calloc(1, sizeof(plasma_request_buffer));
  CHECK(transfer_request != NULL);
  transfer_request->type = PLASMA_TRANSFER;
  transfer_request->object_id = object_conn->object_id;

  if (manager_conn->transfer_queue == NULL) {
    /* If we already have a connection to this manager and it's inactive,
     * (re)register it with the event loop. */
    event_loop_add_file(client_conn->manager_state->loop, manager_conn->fd,
                        EVENT_LOOP_WRITE, send_queued_request, manager_conn);
  }
  /* Add this transfer request to this connection's transfer queue. */
  LL_APPEND(manager_conn->transfer_queue, transfer_request);
}
/**
 * Timer callback for an outstanding object fetch. While retries remain, issue
 * another transfer request and return MANAGER_TIMEOUT; once they are used up,
 * report failure to the client, drop the object context and return AE_NOMORE.
 *
 * @param loop The event loop the timer belongs to (unused).
 * @param id The ID of the timer that fired (unused).
 * @param context The client_object_connection this timer was created for.
 * @return MANAGER_TIMEOUT while retrying, AE_NOMORE once the fetch has failed.
 */
int manager_timeout_handler(event_loop *loop, timer_id id, void *context) {
  client_object_connection *pending = context;
  client_connection *requester = pending->client_conn;
  LOG_DEBUG("Timer went off, %d tries left", pending->num_retries);
  if (pending->num_retries <= 0) {
    /* Out of retries: tell the client we do not have the object. */
    plasma_reply reply = {.object_id = pending->object_id, .has_object = 0};
    send_client_reply(requester, &reply);
    remove_object_connection(requester, pending);
    return AE_NOMORE;
  }
  request_transfer_from(requester, pending->object_id);
  pending->num_retries -= 1;
  return MANAGER_TIMEOUT;
}
/**
 * Object table lookup callback: given the managers an object can be found on,
 * start requesting a transfer from them.
 *
 * @param object_id The object ID we want to request a transfer of.
 * @param manager_count The number of managers the object can be found on.
 * @param manager_vector A vector of the addresses of the managers that the
 *        object can be found on. The vector itself is freed by this function;
 *        the strings it points to are copied, not freed.
 * @param context The context for the connection to this client.
 *
 * The manager addresses are copied into the object context registered for
 * this client and object, a retry timer is started, and the first transfer
 * request is sent. If no manager has the object, failure is reported to the
 * client right away.
 */
void request_transfer(object_id object_id,
                      int manager_count,
                      const char *manager_vector[],
                      void *context) {
  client_connection *client_conn = (client_connection *) context;
  client_object_connection *object_conn =
      get_object_connection(client_conn, object_id);
  CHECK(object_conn);
  LOG_DEBUG("Object is on %d managers", manager_count);
  if (manager_count == 0) {
    /* TODO(swang): Instead of immediately counting this as a failure, maybe
     * register a Redis callback for changes to this object table entry. */
    free(manager_vector);
    plasma_reply failure = {.object_id = object_conn->object_id,
                            .has_object = 0};
    send_client_reply(client_conn, &failure);
    remove_object_connection(client_conn, object_conn);
    return;
  }
  /* Keep a private, NUL-terminated copy of every manager address so that a
   * different manager can be tried on every attempt. */
  size_t vector_bytes = manager_count * sizeof(char *);
  object_conn->manager_count = manager_count;
  object_conn->manager_vector = malloc(vector_bytes);
  memset(object_conn->manager_vector, 0, vector_bytes);
  for (int m = 0; m < manager_count; ++m) {
    const char *source = manager_vector[m];
    int addr_len = strlen(source);
    char *copy = malloc(addr_len + 1);
    object_conn->manager_vector[m] = copy;
    strncpy(copy, source, addr_len);
    copy[addr_len] = '\0';
  }
  free(manager_vector);
  /* Wait for the object data for the default number of retries, each of which
   * times out after the default interval. */
  object_conn->num_retries = NUM_RETRIES;
  object_conn->timer =
      event_loop_add_timer(client_conn->manager_state->loop, MANAGER_TIMEOUT,
                           manager_timeout_handler, object_conn);
  request_transfer_from(client_conn, object_id);
}
/**
 * Handle a client's fetch request for one object.
 *
 * Replies with failure if there is no database connection, replies with
 * success if the local store already contains the object, and otherwise
 * registers an object context for the client and looks the object up in the
 * object table; request_transfer is invoked with the lookup result.
 *
 * @param client_conn The connection context for the client that made the
 *        request.
 * @param object_id The object ID requested.
 * @return Void.
 */
void process_fetch_request(client_connection *client_conn,
                           object_id object_id) {
  plasma_reply reply = {.object_id = object_id};
  if (client_conn->manager_state->db == NULL) {
    reply.has_object = 0;
    send_client_reply(client_conn, &reply);
    return;
  }
  /* Return success immediately if we already have this object. */
  int is_local = 0;
  plasma_contains(client_conn->manager_state->plasma_conn, object_id,
                  &is_local);
  if (is_local) {
    reply.has_object = 1;
    send_client_reply(client_conn, &reply);
    return;
  }
  /* Register the new context with the current client connection. */
  client_object_connection *object_conn =
      add_object_connection(client_conn, object_id);
  if (!object_conn) {
    LOG_DEBUG("Unable to allocate memory for object context.");
    reply.has_object = 0;
    send_client_reply(client_conn, &reply);
    /* The client has already been told the fetch failed. Continuing to the
     * lookup would invoke request_transfer, which requires a registered
     * object context (it CHECKs for one), so stop here. */
    return;
  }
  /* Request a transfer from a plasma manager that has this object. */
  object_table_lookup(client_conn->manager_state->db, object_id,
                      request_transfer, client_conn);
}
/**
 * Handle a fetch request covering several objects by processing each object
 * ID in order. The connection's count of expected return objects is bumped
 * once per object, before that object's request is processed.
 *
 * @param client_conn The connection context for the client that made the
 *        request.
 * @param num_object_ids The number of object IDs requested.
 * @param object_ids The vector of object IDs requested.
 * @return Void.
 */
void process_fetch_requests(client_connection *client_conn,
                            int num_object_ids,
                            object_id object_ids[]) {
  int index = 0;
  while (index < num_object_ids) {
    client_conn->num_return_objects += 1;
    process_fetch_request(client_conn, object_ids[index]);
    index += 1;
  }
}
/**
 * Event loop callback: read one message from a client socket of the plasma
 * manager and dispatch on its type. An unknown message type terminates the
 * process.
 *
 * @param loop The event loop of the plasma manager.
 * @param client_sock The socket the message arrived on.
 * @param context The client_connection associated with this socket.
 * @param events Unused; required by the callback signature.
 * @return Void.
 */
void process_message(event_loop *loop,
                     int client_sock,
                     void *context,
                     int events) {
  client_connection *sender = (client_connection *) context;
  int64_t msg_type;
  int64_t msg_length;
  plasma_request *request;
  read_message(client_sock, &msg_type, &msg_length, (uint8_t **) &request);
  switch (msg_type) {
  case PLASMA_TRANSFER: {
    process_transfer_request(loop, request->object_ids[0], request->addr,
                             request->port, sender);
    break;
  }
  case PLASMA_DATA: {
    LOG_DEBUG("Starting to stream data");
    process_data_request(loop, client_sock, request->object_ids[0],
                         request->data_size, request->metadata_size, sender);
    break;
  }
  case PLASMA_FETCH: {
    LOG_DEBUG("Processing fetch");
    process_fetch_requests(sender, request->num_object_ids,
                           request->object_ids);
    break;
  }
  case PLASMA_SEAL: {
    LOG_DEBUG("Publishing to object table from DB client %d.",
              get_client_id(sender->manager_state->db));
    object_table_add(sender->manager_state->db, request->object_ids[0]);
    break;
  }
  case DISCONNECT_CLIENT: {
    LOG_INFO("Disconnecting client on fd %d", client_sock);
    /* TODO(swang): Check if this connection was to a plasma manager. If so,
     * delete it. */
    event_loop_remove_file(loop, client_sock);
    close(client_sock);
    free(sender);
    break;
  }
  default: {
    LOG_ERR("invalid request %" PRId64, msg_type);
    exit(-1);
  }
  }
  /* The message buffer handed back by read_message is released here. */
  free(request);
}
/**
 * Event loop callback: accept a new connection on the manager's listening
 * socket, create a connection context for it and start handling its messages
 * with process_message.
 *
 * @param loop The event loop of the plasma manager.
 * @param listener_sock The socket the plasma manager is listening on.
 * @param context The plasma manager state.
 * @param events Unused; required by the callback signature.
 * @return Void.
 */
void new_client_connection(event_loop *loop,
                           int listener_sock,
                           void *context,
                           int events) {
  int client_fd = accept_client(listener_sock);
  /* Every client gets its own data connection context. */
  client_connection *client_conn = malloc(sizeof(client_connection));
  client_conn->fd = client_fd;
  client_conn->manager_state = (plasma_manager_state *) context;
  client_conn->transfer_queue = NULL;
  client_conn->active_objects = NULL;
  client_conn->num_return_objects = 0;
  event_loop_add_file(loop, client_fd, EVENT_LOOP_READ, process_message,
                      client_conn);
  LOG_DEBUG("New plasma manager connection with fd %d", client_fd);
}
void start_server(const char *store_socket_name,
const char *master_addr,
int port,
const char *db_addr,
int db_port) {
int sock = bind_inet_sock(port);
CHECKM(sock >= 0, "Unable to bind to manager port");
g_manager_state = init_plasma_manager_state(store_socket_name, master_addr,
port, db_addr, db_port);
CHECK(g_manager_state);
LOG_DEBUG("Started server connected to store %s, listening on port %d",
store_socket_name, port);
event_loop_add_file(g_manager_state->loop, sock, EVENT_LOOP_READ,
new_client_connection, g_manager_state);
event_loop_run(g_manager_state->loop);
}
/* SIGTERM handler: disconnect from the database if the manager state exists,
 * then exit with status 0 so that valgrind reports "success". Any other
 * signal is ignored. */
void signal_handler(int signal) {
  if (signal != SIGTERM) {
    return;
  }
  if (g_manager_state) {
    db_disconnect(g_manager_state->db);
  }
  exit(0);
}
/**
 * Entry point of the plasma manager.
 *
 * Options:
 *   -s  socket name of the plasma store this manager is connected to
 *   -m  IP address of this node
 *   -p  port number the manager should listen on
 *   -d  address and port of the state database (optional)
 *
 * -s, -m and -p are required; the process exits with -1 if one is missing or
 * an unknown option is given.
 */
int main(int argc, char *argv[]) {
  signal(SIGTERM, signal_handler);
  /* Socket name of the plasma store this manager is connected to. */
  char *store_socket_name = NULL;
  /* IP address of this node. */
  char *master_addr = NULL;
  /* Port number the manager should use. -1 means "not given"; without this
   * sentinel an omitted -p would pass an uninitialized value on. */
  int port = -1;
  /* IP address and port of state database. */
  char *db_host = NULL;
  int c;
  while ((c = getopt(argc, argv, "s:m:p:d:")) != -1) {
    switch (c) {
    case 's':
      store_socket_name = optarg;
      break;
    case 'm':
      master_addr = optarg;
      break;
    case 'p':
      port = atoi(optarg);
      break;
    case 'd':
      db_host = optarg;
      break;
    default:
      LOG_ERR("unknown option %c", c);
      exit(-1);
    }
  }
  if (!store_socket_name) {
    LOG_ERR(
        "please specify socket for connecting to the plasma store with -s "
        "switch");
    exit(-1);
  }
  if (!master_addr) {
    LOG_ERR(
        "please specify ip address of the current host in the format "
        "123.456.789.10 with -m switch");
    exit(-1);
  }
  if (port < 0) {
    LOG_ERR("please specify the port the manager should use with -p switch");
    exit(-1);
  }
  if (db_host) {
    /* Split the database host argument into address and port. */
    char db_addr[16];
    int db_port;
    parse_ip_addr_port(db_host, db_addr, &db_port);
    start_server(store_socket_name, master_addr, port, db_addr, db_port);
  } else {
    start_server(store_socket_name, master_addr, port, NULL, 0);
  }
  return 0;
}
+135
View File
@@ -0,0 +1,135 @@
#ifndef PLASMA_MANAGER_H
#define PLASMA_MANAGER_H
#include <poll.h>
#include "utarray.h"
/* Per-connection context used by the plasma manager callbacks below. This
 * header only forward-declares the type; its fields are not visible here. */
typedef struct client_connection client_connection;
/**
 * Process a request from another object store manager to transfer an object.
 *
 * @param loop This is the event loop of the plasma manager.
 * @param object_id The object_id of the object we will be sending.
 * @param addr The IP address of the plasma manager we are sending the object
 *        to, as 4 bytes.
 * @param port The port of the plasma manager we are sending the object to.
 * @param conn The client_connection to the other plasma manager.
 * @return Void.
 *
 * This establishes a connection to the remote manager if one doesn't already
 * exist, and queues up the request to transfer the data to the other object
 * manager.
 *
 * NOTE(review): the manager's process_message dispatches PLASMA_TRANSFER to
 * process_transfer_request(loop, object_id, addr, port, conn), not to
 * process_transfer -- confirm whether the name declared here is stale.
 */
void process_transfer(event_loop *loop,
object_id object_id,
uint8_t addr[4],
int port,
client_connection *conn);
/**
 * Process a request from another object store manager to receive data.
 *
 * @param loop This is the event loop of the plasma manager.
 * @param client_sock The connection to the other plasma manager.
 * @param object_id The object_id of the object we will be reading.
 * @param data_size Size of the object.
 * @param metadata_size Size of the metadata.
 * @param conn The client_connection to the other plasma manager.
 * @return Void.
 *
 * Initializes the object we are going to write to in the local plasma store
 * and then switches the data socket to read the raw object bytes instead of
 * plasma requests.
 *
 * NOTE(review): the manager's process_message dispatches PLASMA_DATA to
 * process_data_request(loop, client_sock, object_id, data_size,
 * metadata_size, conn), not to process_data -- confirm whether the name
 * declared here is stale.
 */
void process_data(event_loop *loop,
int client_sock,
object_id object_id,
int64_t data_size,
int64_t metadata_size,
client_connection *conn);
/**
 * Read the next chunk of the object in transit from the plasma manager
 * connected to the given socket. Once all data for this object has been read,
 * the socket switches to listening for the next plasma request.
 *
 * @param loop This is the event loop of the plasma manager.
 * @param data_sock The connection to the other plasma manager.
 * @param context The client_connection to the other plasma manager.
 * @param events Part of the event loop callback signature. NOTE(review): how
 *        it is used is not visible from this header.
 * @return Void.
 */
void process_data_chunk(event_loop *loop,
int data_sock,
void *context,
int events);
/**
 * Process a fetch request. The fetch request tries:
 * 1) If there is no connection to the database, return failure to the client.
 * 2) If the object is available locally, return success to the client.
 * 3) Query the database for plasma managers that the object might be on.
 * 4) Request a transfer from each of the managers that the object might be on
 *    until we receive the data, or until we time out.
 * 5) Return success or failure to the client depending on whether we received
 *    the data or not.
 *
 * @param client_conn The connection context for the client that made the
 *        request.
 * @param object_id The object ID requested.
 * @return Void.
 */
void process_fetch_request(client_connection *client_conn, object_id object_id);
/**
 * Process a fetch request for multiple objects. The success of each object
 * will be written back individually to the socket connected to the client that
 * made the request in a plasma_reply. See documentation for
 * process_fetch_request for the sequence of operations per object.
 *
 * @param client_conn The connection context for the client that made the
 *        request.
 * @param object_id_count The number of object IDs requested.
 * @param object_ids The vector of object IDs requested; it must hold at least
 *        object_id_count entries.
 * @return Void.
 */
void process_fetch_requests(client_connection *client_conn,
int object_id_count,
object_id object_ids[]);
/**
 * Send the next request queued for the other plasma manager connected to the
 * socket "data_sock". This could be a request to either write object data or
 * request object data. If the request is to write object data and no data has
 * been sent yet, the initial handshake to transfer the object size is
 * performed.
 *
 * @param loop This is the event loop of the plasma manager.
 * @param data_sock This is the socket the other plasma manager is listening on.
 * @param context The client_connection to the other plasma manager, contains a
 *        queue of objects that will be sent.
 * @param events Part of the event loop callback signature. NOTE(review): how
 *        it is used is not visible from this header.
 * @return Void.
 */
void send_queued_request(event_loop *loop,
int data_sock,
void *context,
int events);
/**
 * Register a new client connection with the plasma manager. A client can
 * either be a worker or another plasma manager.
 *
 * @param loop This is the event loop of the plasma manager.
 * @param listener_sock The socket the plasma manager is listening on.
 * @param context The plasma manager state.
 * @param events Part of the event loop callback signature.
 * @return Void.
 */
void new_client_connection(event_loop *loop,
int listener_sock,
void *context,
int events);
/* The buffer size in bytes. Data will get transferred in multiples of this. */
#define BUFSIZE 4096
#endif /* PLASMA_MANAGER_H */
+520
View File
@@ -0,0 +1,520 @@
/* PLASMA STORE: This is a simple object store server process
*
* It accepts incoming client connections on a unix domain socket
* (name passed in via the -s option of the executable) and uses a
* single thread to serve the clients. Each client establishes a
* connection and can create objects, wait for objects and seal
* objects through that connection.
*
* It keeps a hash table that maps object_ids (which are 20 bytes long,
* just enough to store a SHA1 hash) to memory mapped files. */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/un.h>
#include <getopt.h>
#include <string.h>
#include <signal.h>
#include <limits.h>
#include <poll.h>
#include "common.h"
#include "event_loop.h"
#include "io.h"
#include "uthash.h"
#include "utarray.h"
#include "fling.h"
#include "malloc.h"
#include "plasma_store.h"
/* Allocator used for object payloads in this file: create_object allocates
 * the data + metadata buffer with dlmalloc and delete_object releases it with
 * dlfree. Only declared here; the definitions live outside this file. */
void *dlmalloc(size_t);
void dlfree(void *);
/**
 * Write a complete plasma_reply to the given file descriptor. This is how the
 * Plasma Store answers the Plasma Client. If the reply cannot be written in
 * full by a single write, an error is logged and the process exits.
 *
 * @param fd The file descriptor to write the reply to.
 * @param reply The reply to send.
 */
void plasma_send_reply(int fd, plasma_reply *reply) {
  int expected_bytes = sizeof(plasma_reply);
  if (write(fd, reply, expected_bytes) == expected_bytes) {
    return;
  }
  LOG_ERR("write error, fd = %d", fd);
  exit(-1);
}
/* Bookkeeping record for one object in the store. Entries live in either the
 * open_objects or the sealed_objects hash table of plasma_store_state. */
typedef struct {
/* Object id of this object. This is the key of the hash table. */
object_id object_id;
/* Object info like size, creation time and owner. Only data_size and
 * metadata_size are filled in by create_object at the moment. */
plasma_object_info info;
/* Memory mapped file containing the object, as reported by
 * get_malloc_mapinfo for the object's buffer. */
int fd;
/* Size of the underlying map. */
int64_t map_size;
/* Offset from the base of the mmap. */
ptrdiff_t offset;
/* Handle for the uthash table. */
UT_hash_handle handle;
/* Pointer to the object data (returned by dlmalloc). Needed to free the
 * object with dlfree. */
uint8_t *pointer;
/** An array of the clients (stored as client pointers) that are currently
 * using this object. */
UT_array *clients;
} object_table_entry;
/* Record of the clients waiting for one not-yet-sealed object. get_object
 * creates the entry and seal_object removes and frees it after notifying the
 * waiting clients. */
typedef struct {
/* Object id of this object. This is the key of the hash table. */
object_id object_id;
/* An array of the clients (stored as client pointers) that are waiting to get
 * this object. */
UT_array *waiting_clients;
/* Handle for the uthash table. */
UT_hash_handle handle;
} object_notify_entry;
/** Contains all information that is associated with a client. One instance is
 * allocated per accepted connection in new_client_connection and passed as
 * the context of that socket's event loop callback. */
struct client {
/** The socket used to communicate with the client. */
int sock;
/** A pointer to the global plasma state. */
plasma_store_state *plasma_state;
};
/* Element descriptor for utarrays of client pointers (elements have size
 * sizeof(client *)). Used for object_table_entry.clients and
 * object_notify_entry.waiting_clients. */
UT_icd client_icd = {sizeof(client *), NULL, NULL, NULL};
/* Element descriptor for utarrays of object IDs (elements have size
 * sizeof(object_id)). Despite its name it does not describe
 * object_table_entry elements; it is used for notification_queue.object_ids. */
UT_icd object_table_entry_icd = {sizeof(object_id), NULL, NULL, NULL};
/* Per-subscriber queue of seal notifications that still have to be sent. An
 * entry is created in subscribe_to_updates and drained by send_notifications. */
typedef struct {
/** Client file descriptor. This is used as a key for the hash table. */
int subscriber_fd;
/** The object IDs to notify the client about. We notify the client about the
 * IDs in the order that the objects were sealed. */
UT_array *object_ids;
/** Handle for the uthash table. */
UT_hash_handle hh;
} notification_queue;
/* Global state of the plasma store. Created by init_plasma_store with all
 * hash tables empty (NULL). */
struct plasma_store_state {
/* Event loop of the plasma store. */
event_loop *loop;
/* Objects that are still being written by their owner process. Hash table
 * keyed by object ID. */
object_table_entry *open_objects;
/* Objects that have already been sealed by their owner process and
 * can now be shared with other processes. Hash table keyed by object ID. */
object_table_entry *sealed_objects;
/* Objects that processes are waiting for. Hash table keyed by object ID. */
object_notify_entry *objects_notify;
/** The pending notifications that have not been sent to subscribers because
 * the socket send buffers were full. This is a hash table from client file
 * descriptor to an array of object_ids to send to that client. */
notification_queue *pending_notifications;
};
/**
 * Allocate the global state of the plasma store. All object and notification
 * tables start out empty.
 *
 * @param loop The event loop the store will run on.
 * @return The newly allocated store state.
 */
plasma_store_state *init_plasma_store(event_loop *loop) {
  plasma_store_state *store = malloc(sizeof(plasma_store_state));
  /* Empty uthash tables are represented by NULL head pointers. */
  store->pending_notifications = NULL;
  store->objects_notify = NULL;
  store->sealed_objects = NULL;
  store->open_objects = NULL;
  store->loop = loop;
  return store;
}
/* Record that a client is using an object: append the client to the object's
 * list of clients unless it is already in that list. */
void add_client_to_object_clients(object_table_entry *entry,
                                  client *client_info) {
  /* Scan the current users of the object for this client. */
  int index = 0;
  while (index < utarray_len(entry->clients)) {
    client **existing = (client **) utarray_eltptr(entry->clients, index);
    if (*existing == client_info) {
      /* Already registered, nothing to do. */
      return;
    }
    index += 1;
  }
  /* Not found: store the client pointer in the list of users. */
  utarray_push_back(entry->clients, &client_info);
}
/* Create a new, still open object: allocate a buffer for data and metadata,
 * add an entry for it to the table of open objects, describe the buffer in
 * "result" and register the creating client as a user of the object. Creating
 * an object ID that already exists (open or sealed) is a fatal error. */
void create_object(client *client_context,
                   object_id object_id,
                   int64_t data_size,
                   int64_t metadata_size,
                   plasma_object *result) {
  LOG_DEBUG("creating object"); /* TODO(pcm): add object_id here */
  plasma_store_state *store = client_context->plasma_state;
  object_table_entry *entry;
  /* TODO(swang): Return these error to the client instead of exiting. */
  HASH_FIND(handle, store->open_objects, &object_id, sizeof(object_id), entry);
  CHECKM(entry == NULL, "Cannot create object twice.");
  HASH_FIND(handle, store->sealed_objects, &object_id, sizeof(object_id),
            entry);
  CHECKM(entry == NULL, "Cannot create object twice.");
  /* Data and metadata share one allocation, metadata following the data. */
  uint8_t *buffer = dlmalloc(data_size + metadata_size);
  int fd;
  int64_t mapped_bytes;
  ptrdiff_t buffer_offset;
  get_malloc_mapinfo(buffer, &fd, &mapped_bytes, &buffer_offset);
  assert(fd != -1);
  /* Describe the new object to the caller. */
  result->handle.store_fd = fd;
  result->handle.mmap_size = mapped_bytes;
  result->data_offset = buffer_offset;
  result->metadata_offset = buffer_offset + data_size;
  result->data_size = data_size;
  result->metadata_size = metadata_size;
  /* Build the table entry and file it under the open objects. */
  entry = malloc(sizeof(object_table_entry));
  memcpy(&entry->object_id, &object_id, sizeof(object_id));
  entry->pointer = buffer;
  entry->fd = fd;
  entry->map_size = mapped_bytes;
  entry->offset = buffer_offset;
  entry->info.data_size = data_size;
  entry->info.metadata_size = metadata_size;
  /* TODO(pcm): set the other fields */
  utarray_new(entry->clients, &client_icd);
  HASH_ADD(handle, store->open_objects, object_id, sizeof(object_id), entry);
  /* Record that this client is using this object. */
  add_client_to_object_clients(entry, client_context);
}
/* Look up a sealed object. If it exists, describe it in "result", register
 * the client as a user of the object and return OBJECT_FOUND. Otherwise add
 * the client to the list of clients waiting for the object (creating that
 * list if needed) and return OBJECT_NOT_FOUND; "result" is left untouched in
 * that case. The "conn" argument is not used. */
int get_object(client *client_context,
               int conn,
               object_id object_id,
               plasma_object *result) {
  plasma_store_state *store = client_context->plasma_state;
  object_table_entry *sealed;
  HASH_FIND(handle, store->sealed_objects, &object_id, sizeof(object_id),
            sealed);
  if (sealed != NULL) {
    result->handle.store_fd = sealed->fd;
    result->handle.mmap_size = sealed->map_size;
    result->data_offset = sealed->offset;
    result->metadata_offset = sealed->offset + sealed->info.data_size;
    result->data_size = sealed->info.data_size;
    result->metadata_size = sealed->info.metadata_size;
    /* If necessary, record that this client is using this object. For objects
     * that are not sealed yet, seal_object does this later on. */
    add_client_to_object_clients(sealed, client_context);
    return OBJECT_FOUND;
  }
  LOG_DEBUG("object not in hash table of sealed objects");
  object_notify_entry *waiters;
  HASH_FIND(handle, store->objects_notify, &object_id, sizeof(object_id),
            waiters);
  if (waiters == NULL) {
    /* First client waiting for this object: set up its notify entry. */
    waiters = malloc(sizeof(object_notify_entry));
    memset(waiters, 0, sizeof(object_notify_entry));
    utarray_new(waiters->waiting_clients, &client_icd);
    memcpy(&waiters->object_id, &object_id, sizeof(object_id));
    HASH_ADD(handle, store->objects_notify, object_id, sizeof(object_id),
             waiters);
  }
  utarray_push_back(waiters->waiting_clients, &client_context);
  return OBJECT_NOT_FOUND;
}
/* Remove the first occurrence of a client from an object's list of clients.
 * Returns 1 if the client was found and removed, 0 if it was not in the
 * list. */
int remove_client_from_object_clients(object_table_entry *entry,
                                      client *client_info) {
  int position = 0;
  while (position < utarray_len(entry->clients)) {
    client **candidate = (client **) utarray_eltptr(entry->clients, position);
    if (*candidate == client_info) {
      /* Found it: drop this one element and report the removal. */
      utarray_erase(entry->clients, position, 1);
      return 1;
    }
    position += 1;
  }
  /* The client was not using this object. */
  return 0;
}
/* Record that a client is done with an object. The object must exist in
 * exactly one of the open and sealed tables, and the client must currently be
 * registered as a user of it; both conditions are enforced with CHECK. */
void release_object(client *client_context, object_id object_id) {
  plasma_store_state *store = client_context->plasma_state;
  object_table_entry *sealed_entry;
  object_table_entry *open_entry;
  HASH_FIND(handle, store->open_objects, &object_id, sizeof(object_id),
            open_entry);
  HASH_FIND(handle, store->sealed_objects, &object_id, sizeof(object_id),
            sealed_entry);
  /* Exactly one of open_entry and sealed_entry should be NULL. */
  CHECK((open_entry == NULL) != (sealed_entry == NULL));
  /* Remove the client from the array of clients of whichever entry exists. */
  if (open_entry == NULL) {
    CHECK(remove_client_from_object_clients(sealed_entry, client_context) == 1);
  } else {
    CHECK(remove_client_from_object_clients(open_entry, client_context) == 1);
  }
}
/* Check whether an object is present in the store. Only sealed objects count:
 * returns OBJECT_FOUND if the object ID is in the table of sealed objects and
 * OBJECT_NOT_FOUND otherwise. */
int contains_object(client *client_context, object_id object_id) {
  object_table_entry *sealed;
  HASH_FIND(handle, client_context->plasma_state->sealed_objects, &object_id,
            sizeof(object_id), sealed);
  if (sealed) {
    return OBJECT_FOUND;
  }
  return OBJECT_NOT_FOUND;
}
/* Seal an open object: move its entry to the table of sealed objects, queue
 * and try to send a notification to every subscriber, and hand the object to
 * all clients that were waiting for it in get_object. Sealing an object that
 * is not open is a fatal error. */
void seal_object(client *client_context, object_id object_id) {
  LOG_DEBUG("sealing object"); /* TODO(pcm): add object_id here */
  plasma_store_state *store = client_context->plasma_state;
  object_table_entry *entry;
  HASH_FIND(handle, store->open_objects, &object_id, sizeof(object_id), entry);
  CHECK(entry != NULL);
  /* Move the object table entry from the table of open objects to the table of
   * sealed objects. */
  HASH_DELETE(handle, store->open_objects, entry);
  HASH_ADD(handle, store->sealed_objects, object_id, sizeof(object_id), entry);
  /* Inform all subscribers that a new object has been sealed. */
  notification_queue *subscriber, *next_subscriber;
  HASH_ITER(hh, store->pending_notifications, subscriber, next_subscriber) {
    utarray_push_back(subscriber->object_ids, &object_id);
    send_notifications(store->loop, subscriber->subscriber_fd, store, 0);
  }
  /* Inform processes getting this object that the object is ready now. */
  object_notify_entry *waiters;
  HASH_FIND(handle, store->objects_notify, &object_id, sizeof(object_id),
            waiters);
  if (waiters == NULL) {
    /* Nobody was waiting for this object. */
    return;
  }
  plasma_reply reply;
  memset(&reply, 0, sizeof(reply));
  reply.object.handle.store_fd = entry->fd;
  reply.object.handle.mmap_size = entry->map_size;
  reply.object.data_offset = entry->offset;
  reply.object.metadata_offset = entry->offset + entry->info.data_size;
  reply.object.data_size = entry->info.data_size;
  reply.object.metadata_size = entry->info.metadata_size;
  HASH_DELETE(handle, store->objects_notify, waiters);
  /* Send the reply to each client that was waiting for this object. */
  for (int w = 0; w < utarray_len(waiters->waiting_clients); ++w) {
    client *waiting = *(client **) utarray_eltptr(waiters->waiting_clients, w);
    send_fd(waiting->sock, reply.object.handle.store_fd, (char *) &reply,
            sizeof(reply));
    /* Record that the client is using this object. */
    add_client_to_object_clients(entry, waiting);
  }
  utarray_free(waiters->waiting_clients);
  free(waiters);
}
/* Delete a sealed object: remove its entry from the table of sealed objects
 * and free the object buffer, the client list and the entry. It is a fatal
 * error if the object is not sealed or if any client is still using it. */
void delete_object(client *client_context, object_id object_id) {
  LOG_DEBUG("deleting object"); /* TODO(rkn): add object_id here */
  plasma_store_state *store = client_context->plasma_state;
  object_table_entry *entry;
  HASH_FIND(handle, store->sealed_objects, &object_id, sizeof(object_id),
            entry);
  /* TODO(rkn): This should probably not fail, but should instead throw an
   * error. Maybe we should also support deleting objects that have been created
   * but not sealed. */
  CHECKM(entry != NULL, "To delete an object it must have been sealed.");
  CHECKM(utarray_len(entry->clients) == 0,
         "To delete an object, there must be no clients currently using it.");
  HASH_DELETE(handle, store->sealed_objects, entry);
  /* Release the object buffer first, then the bookkeeping around it. */
  dlfree(entry->pointer);
  utarray_free(entry->clients);
  free(entry);
}
/* Send queued notifications to a subscriber. Object IDs are sent in queue
 * order until the queue is empty or the socket's send buffer is full; the IDs
 * that were sent are then removed from the queue. A subscriber without a
 * notification queue, a partial send, or any other send error is fatal. */
void send_notifications(event_loop *loop,
                        int client_sock,
                        void *context,
                        int events) {
  plasma_store_state *store = context;
  notification_queue *queue;
  HASH_FIND_INT(store->pending_notifications, &client_sock, queue);
  CHECK(queue != NULL);
  /* Number of notifications sent so far; also the index of the next one. */
  int num_processed = 0;
  while (num_processed < utarray_len(queue->object_ids)) {
    object_id *next_id =
        (object_id *) utarray_eltptr(queue->object_ids, num_processed);
    /* Attempt to send a notification about this object ID. */
    int nbytes = send(client_sock, next_id, sizeof(object_id), 0);
    if (nbytes == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
      LOG_DEBUG(
          "The socket's send buffer is full, so we are caching this "
          "notification and will send it later.");
      break;
    }
    if (nbytes >= 0) {
      CHECK(nbytes == sizeof(object_id));
    } else {
      CHECKM(0, "This code should be unreachable.");
    }
    num_processed += 1;
  }
  /* Remove the sent notifications from the array. */
  utarray_erase(queue->object_ids, 0, num_processed);
}
/* Subscribe a client to notifications about sealed objects. The file
 * descriptor to send notifications on is received over "conn". Subscribing is
 * only allowed while the store holds no objects at all (open or sealed);
 * anything else is a fatal error. */
void subscribe_to_updates(client *client_context, int conn) {
  LOG_DEBUG("subscribing to updates");
  plasma_store_state *plasma_state = client_context->plasma_state;
  /* A single placeholder byte accompanies the file descriptor. */
  char placeholder;
  int subscriber_fd = recv_fd(conn, &placeholder, 1);
  CHECKM(HASH_CNT(handle, plasma_state->open_objects) == 0,
         "plasma_subscribe should be called before any objects are created.");
  CHECKM(HASH_CNT(handle, plasma_state->sealed_objects) == 0,
         "plasma_subscribe should be called before any objects are created.");
  /* Create a new array to buffer notifications that can't be sent to the
   * subscriber yet because the socket send buffer is full. TODO(rkn): the queue
   * never gets freed. */
  notification_queue *pending =
      (notification_queue *) malloc(sizeof(notification_queue));
  pending->subscriber_fd = subscriber_fd;
  utarray_new(pending->object_ids, &object_table_entry_icd);
  HASH_ADD_INT(plasma_state->pending_notifications, subscriber_fd, pending);
  /* Add a callback to the event loop to send queued notifications whenever
   * there is room in the socket's send buffer. */
  event_loop_add_file(plasma_state->loop, subscriber_fd, EVENT_LOOP_WRITE,
                      send_notifications, plasma_state);
}
/**
 * Event loop callback: read one request from a client of the plasma store and
 * handle it according to its type. An unknown request type is fatal.
 *
 * @param loop The event loop of the plasma store.
 * @param client_sock The socket the request arrived on.
 * @param context The client that owns this socket.
 * @param events Unused; required by the callback signature.
 * @return Void.
 */
void process_message(event_loop *loop,
                     int client_sock,
                     void *context,
                     int events) {
  client *sender = context;
  int64_t msg_type;
  int64_t msg_length;
  plasma_request *request;
  read_message(client_sock, &msg_type, &msg_length, (uint8_t **) &request);
  /* We're only sending a single object ID at a time for now. */
  plasma_reply reply;
  memset(&reply, 0, sizeof(reply));
  /* Process the different types of requests. */
  switch (msg_type) {
  case PLASMA_CREATE: {
    create_object(sender, request->object_ids[0], request->data_size,
                  request->metadata_size, &reply.object);
    send_fd(client_sock, reply.object.handle.store_fd, (char *) &reply,
            sizeof(reply));
    break;
  }
  case PLASMA_GET: {
    /* If the object is not sealed yet, no reply is sent here; seal_object
     * replies to the waiting clients later. */
    int status = get_object(sender, client_sock, request->object_ids[0],
                            &reply.object);
    if (status == OBJECT_FOUND) {
      send_fd(client_sock, reply.object.handle.store_fd, (char *) &reply,
              sizeof(reply));
    }
    break;
  }
  case PLASMA_RELEASE: {
    release_object(sender, request->object_ids[0]);
    break;
  }
  case PLASMA_CONTAINS: {
    if (contains_object(sender, request->object_ids[0]) == OBJECT_FOUND) {
      reply.has_object = 1;
    }
    plasma_send_reply(client_sock, &reply);
    break;
  }
  case PLASMA_SEAL: {
    seal_object(sender, request->object_ids[0]);
    break;
  }
  case PLASMA_DELETE: {
    delete_object(sender, request->object_ids[0]);
    break;
  }
  case PLASMA_SUBSCRIBE: {
    subscribe_to_updates(sender, client_sock);
    break;
  }
  case DISCONNECT_CLIENT: {
    LOG_DEBUG("Disconnecting client on fd %d", client_sock);
    event_loop_remove_file(loop, client_sock);
    /* If this client was using any objects, remove it from the appropriate
     * lists. */
    plasma_store_state *store = sender->plasma_state;
    object_table_entry *object, *next_object;
    HASH_ITER(handle, store->open_objects, object, next_object) {
      remove_client_from_object_clients(object, sender);
    }
    HASH_ITER(handle, store->sealed_objects, object, next_object) {
      remove_client_from_object_clients(object, sender);
    }
    break;
  }
  default: {
    /* This code should be unreachable. */
    CHECK(0);
  }
  }
  /* The request buffer handed back by read_message is released here. */
  free(request);
}
/**
 * Event loop callback: accept a new client on the store's listening socket
 * and start handling its requests with process_message.
 *
 * @param loop The event loop of the plasma store.
 * @param listener_sock The socket the plasma store is listening on.
 * @param context The plasma store state.
 * @param events Unused; required by the callback signature.
 * @return Void.
 */
void new_client_connection(event_loop *loop,
                           int listener_sock,
                           void *context,
                           int events) {
  plasma_store_state *store = context;
  int client_fd = accept_client(listener_sock);
  /* Create a new client object. This will also be used as the context to use
   * for events on this client's socket. TODO(rkn): free this somewhere. */
  client *new_client = (client *) malloc(sizeof(client));
  new_client->plasma_state = store;
  new_client->sock = client_fd;
  /* Add a callback to handle events on this socket. */
  event_loop_add_file(loop, client_fd, EVENT_LOOP_READ, process_message,
                      new_client);
  LOG_DEBUG("new connection with fd %d", client_fd);
}
/* SIGTERM handler: exit with status 0 so that valgrind reports "success".
 * Any other signal is ignored. */
void signal_handler(int signal) {
  if (signal != SIGTERM) {
    return;
  }
  exit(0);
}
/* Bind the store's listening socket under the given name, create the event
 * loop and the store state, and run the loop, accepting clients via
 * new_client_connection. Failing to bind the socket is fatal. */
void start_server(char *socket_name) {
  int socket = bind_ipc_sock(socket_name);
  CHECK(socket >= 0);
  event_loop *store_loop = event_loop_create();
  plasma_store_state *store = init_plasma_store(store_loop);
  /* Incoming connections are handed to new_client_connection together with
   * the store state. */
  event_loop_add_file(store_loop, socket, EVENT_LOOP_READ,
                      new_client_connection, store);
  event_loop_run(store_loop);
}
/**
 * Entry point of the plasma store.
 *
 * Options:
 *   -s  name of the socket to accept incoming client connections on
 *
 * The process exits with -1 on an unknown option or if -s is missing.
 */
int main(int argc, char *argv[]) {
  signal(SIGTERM, signal_handler);
  char *socket_name = NULL;
  for (;;) {
    int option = getopt(argc, argv, "s:");
    if (option == -1) {
      /* All options have been consumed. */
      break;
    }
    if (option != 's') {
      exit(-1);
    }
    socket_name = optarg;
  }
  if (socket_name == NULL) {
    LOG_ERR("please specify socket for incoming connections with -s switch");
    exit(-1);
  }
  LOG_DEBUG("starting server listening on %s", socket_name);
  start_server(socket_name);
}
+97
View File
@@ -0,0 +1,97 @@
#ifndef PLASMA_STORE_H
#define PLASMA_STORE_H
#include "plasma.h"
typedef struct client client;
typedef struct plasma_store_state plasma_store_state;
/**
 * Create a new object. The client must do a call to release_object to tell the
 * store when it is done with the object.
 *
 * @param client_context The context of the client making this request.
 * @param object_id Object ID of the object to be created.
 * @param data_size Size in bytes of the object to be created.
 * @param metadata_size Size in bytes of the object metadata.
 * @param result Output: filled in with the store file descriptor, the size of
 *        the memory map, and the offsets and sizes of the data and metadata
 *        of the new object.
 * @return Void.
 */
void create_object(client *client_context,
object_id object_id,
int64_t data_size,
int64_t metadata_size,
plasma_object *result);
/**
 * Get an object. This method assumes that we currently have or will eventually
 * have this object sealed. If the object has not yet been sealed, the client
 * that requested the object will be notified when it is sealed.
 *
 * For each call to get_object, the client must do a call to release_object to
 * tell the store when it is done with the object.
 *
 * @param client_context The context of the client making this request.
 * @param conn The client connection that requests the object.
 * @param object_id Object ID of the object to be gotten.
 * @param result Output: describes the object (file descriptor, map size,
 *        offsets and sizes). Only filled in when OBJECT_FOUND is returned.
 * @return The status of the object (object_status in plasma.h): OBJECT_FOUND
 *         if the object is sealed, OBJECT_NOT_FOUND otherwise.
 */
int get_object(client *client_context,
int conn,
object_id object_id,
plasma_object *result);
/**
 * Record the fact that a particular client is no longer using an object.
 *
 * @param client_context The context of the client making this request.
 * @param object_id The object ID of the object that is being released.
 * @return Void.
 */
void release_object(client *client_context, object_id object_id);
/**
 * Seal an object. The object is now immutable and can be accessed with get.
 * Subscribers and clients that were waiting for the object are notified.
 *
 * @param client_context The context of the client making this request.
 * @param object_id Object ID of the object to be sealed. The object must have
 *        been created and not sealed yet.
 * @return Void.
 */
void seal_object(client *client_context, object_id object_id);
/**
 * Check if the plasma store contains an object.
 *
 * @param client_context The context of the client making this request.
 * @param object_id Object ID that will be checked.
 * @return OBJECT_FOUND if the object is in the store, OBJECT_NOT_FOUND if not.
 */
int contains_object(client *client_context, object_id object_id);
/**
 * Delete an object from the plasma store.
 *
 * @param client_context The context of the client making this request.
 * @param object_id Object ID of the object to be deleted.
 * @return Void.
 */
void delete_object(client *client_context, object_id object_id);
/**
 * Send notifications about sealed objects to the subscribers. This is called
 * in seal_object. If the socket's send buffer is full, the notification will be
 * buffered, and this will be called again when the send buffer has room.
 *
 * The parameter list matches the signature of an event loop callback, which is
 * why plasma_state is passed as an untyped pointer.
 *
 * @param loop The Plasma store event loop.
 * @param client_sock The socket of the client to send the notification to.
 * @param plasma_state The plasma store global state.
 * @param events This is needed for this function to have the signature of a
 *        callback.
 * @return Void.
 */
void send_notifications(event_loop *loop,
int client_sock,
void *plasma_state,
int events);
#endif /* PLASMA_STORE_H */
+396
View File
@@ -0,0 +1,396 @@
from __future__ import print_function
import os
import signal
import socket
import struct
import subprocess
import sys
import unittest
import random
import time
import tempfile
import plasma
USE_VALGRIND = False
def random_object_id():
    """Return a random object ID: a string of plasma.PLASMA_ID_SIZE random characters."""
    id_chars = []
    for _ in range(plasma.PLASMA_ID_SIZE):
        id_chars.append(chr(random.randint(0, 255)))
    return "".join(id_chars)
def generate_metadata(length):
    """Build a metadata buffer of `length` bytes with some randomized content.

    The buffer starts out as all zero bytes. For a non-empty buffer, the first
    and last bytes and up to 100 randomly chosen positions are overwritten
    with random values. An empty buffer is returned for length 0.
    """
    metadata_chars = ["\x00"] * length
    if length > 0:
        metadata_chars[0] = chr(random.randint(0, 255))
        metadata_chars[-1] = chr(random.randint(0, 255))
        for _ in range(100):
            # Draw the value before the position: this is the order in which
            # a subscript assignment evaluates its two sides.
            value = chr(random.randint(0, 255))
            position = random.randint(0, length - 1)
            metadata_chars[position] = value
    return buffer("".join(metadata_chars))
def write_to_data_buffer(buff, length):
    """Scribble random bytes into the writable buffer `buff` of size `length`.

    Does nothing for length 0. Otherwise the first and last bytes and up to
    100 randomly chosen positions are overwritten in place.
    """
    if length <= 0:
        return
    buff[0] = chr(random.randint(0, 255))
    buff[-1] = chr(random.randint(0, 255))
    for _ in range(100):
        # Draw the value before the position: this is the order in which a
        # subscript assignment evaluates its two sides.
        value = chr(random.randint(0, 255))
        position = random.randint(0, length - 1)
        buff[position] = value
def create_object(client, data_size, metadata_size, seal=True):
    """Create an object with random contents through `client`.

    A fresh random object ID and random metadata of `metadata_size` bytes are
    generated, the object is created with `data_size` bytes of data, random
    bytes are written into its data buffer, and the object is sealed unless
    `seal` is false.

    Returns a tuple (object_id, memory_buffer, metadata).
    """
    new_object_id = random_object_id()
    new_metadata = generate_metadata(metadata_size)
    data_buffer = client.create(new_object_id, data_size, new_metadata)
    write_to_data_buffer(data_buffer, data_size)
    if seal:
        client.seal(new_object_id)
    return new_object_id, data_buffer, new_metadata
def assert_get_object_equal(unit_test, client1, client2, object_id, memory_buffer=None, metadata=None):
    """Assert that both clients see the same data and metadata for `object_id`.

    If `memory_buffer` and/or `metadata` are given, they are first compared
    against what client2 returns. Failures are reported through `unit_test`
    (a unittest.TestCase).
    """
    if memory_buffer is not None:
        expected_data = memory_buffer[:]
        actual_data = client2.get(object_id)[:]
        unit_test.assertEqual(expected_data, actual_data)
    if metadata is not None:
        expected_metadata = metadata[:]
        actual_metadata = client2.get_metadata(object_id)[:]
        unit_test.assertEqual(expected_metadata, actual_metadata)
    # The two clients must agree with each other as well.
    data1 = client1.get(object_id)[:]
    data2 = client2.get(object_id)[:]
    unit_test.assertEqual(data1, data2)
    metadata1 = client1.get_metadata(object_id)[:]
    metadata2 = client2.get_metadata(object_id)[:]
    unit_test.assertEqual(metadata1, metadata2)
class TestPlasmaClient(unittest.TestCase):
    """Tests for a single PlasmaClient connected to a freshly started plasma store."""

    def setUp(self):
        # Start Plasma.
        plasma_store_executable = os.path.join(os.path.abspath(os.path.dirname(__file__)), "../build/plasma_store")
        store_name = "/tmp/store{}".format(random.randint(0, 10000))
        command = [plasma_store_executable, "-s", store_name]
        if USE_VALGRIND:
            # --error-exitcode=1 makes valgrind exit with a non-zero status when
            # it detects errors, which is what tearDown checks for.
            self.p = subprocess.Popen(["valgrind", "--track-origins=yes", "--leak-check=full", "--error-exitcode=1"] + command)
            time.sleep(2.0)
        else:
            self.p = subprocess.Popen(command)
        # Connect to Plasma.
        self.plasma_client = plasma.PlasmaClient(store_name)

    def tearDown(self):
        # Kill the plasma store process.
        if USE_VALGRIND:
            self.p.send_signal(signal.SIGTERM)
            self.p.wait()
            if self.p.returncode != 0:
                os._exit(-1)
        else:
            self.p.kill()
            # Reap the killed process so that it does not linger as a zombie
            # for the remainder of the test run.
            self.p.wait()

    def test_create(self):
        # Create an object id string.
        object_id = random_object_id()
        # Create a new buffer and write to it.
        length = 50
        memory_buffer = self.plasma_client.create(object_id, length)
        for i in range(length):
            memory_buffer[i] = chr(i % 256)
        # Seal the object.
        self.plasma_client.seal(object_id)
        # Get the object.
        memory_buffer = self.plasma_client.get(object_id)
        for i in range(length):
            self.assertEqual(memory_buffer[i], chr(i % 256))

    def test_create_with_metadata(self):
        for length in range(1000):
            # Create an object id string.
            object_id = random_object_id()
            # Create a random metadata string.
            metadata = generate_metadata(length)
            # Create a new buffer and write to it.
            memory_buffer = self.plasma_client.create(object_id, length, metadata)
            for i in range(length):
                memory_buffer[i] = chr(i % 256)
            # Seal the object.
            self.plasma_client.seal(object_id)
            # Get the object.
            memory_buffer = self.plasma_client.get(object_id)
            for i in range(length):
                self.assertEqual(memory_buffer[i], chr(i % 256))
            # Get the metadata.
            metadata_buffer = self.plasma_client.get_metadata(object_id)
            self.assertEqual(len(metadata), len(metadata_buffer))
            for i in range(len(metadata)):
                self.assertEqual(metadata[i], metadata_buffer[i])

    def test_contains(self):
        fake_object_ids = [random_object_id() for _ in range(100)]
        real_object_ids = [random_object_id() for _ in range(100)]
        for object_id in real_object_ids:
            self.assertFalse(self.plasma_client.contains(object_id))
            self.plasma_client.create(object_id, 100)
            self.plasma_client.seal(object_id)
            self.assertTrue(self.plasma_client.contains(object_id))
        for object_id in fake_object_ids:
            self.assertFalse(self.plasma_client.contains(object_id))
        for object_id in real_object_ids:
            self.assertTrue(self.plasma_client.contains(object_id))

    # def test_individual_delete(self):
    #     length = 100
    #     # Create an object id string.
    #     object_id = random_object_id()
    #     # Create a random metadata string.
    #     metadata = generate_metadata(100)
    #     # Create a new buffer and write to it.
    #     memory_buffer = self.plasma_client.create(object_id, length, metadata)
    #     for i in range(length):
    #         memory_buffer[i] = chr(i % 256)
    #     # Seal the object.
    #     self.plasma_client.seal(object_id)
    #     # Check that the object is present.
    #     self.assertTrue(self.plasma_client.contains(object_id))
    #     # Delete the object.
    #     self.plasma_client.delete(object_id)
    #     # Make sure the object is no longer present.
    #     self.assertFalse(self.plasma_client.contains(object_id))
    #
    # def test_delete(self):
    #     # Create some objects.
    #     object_ids = [random_object_id() for _ in range(100)]
    #     for object_id in object_ids:
    #         length = 100
    #         # Create a random metadata string.
    #         metadata = generate_metadata(100)
    #         # Create a new buffer and write to it.
    #         memory_buffer = self.plasma_client.create(object_id, length, metadata)
    #         for i in range(length):
    #             memory_buffer[i] = chr(i % 256)
    #         # Seal the object.
    #         self.plasma_client.seal(object_id)
    #         # Check that the object is present.
    #         self.assertTrue(self.plasma_client.contains(object_id))
    #
    #     # Delete the objects and make sure they are no longer present.
    #     for object_id in object_ids:
    #         # Delete the object.
    #         self.plasma_client.delete(object_id)
    #         # Make sure the object is no longer present.
    #         self.assertFalse(self.plasma_client.contains(object_id))

    def test_illegal_functionality(self):
        # Create an object id string.
        object_id = random_object_id()
        # Create a new buffer and write to it.
        length = 1000
        memory_buffer = self.plasma_client.create(object_id, length)
        # Make sure we cannot access memory out of bounds.
        self.assertRaises(Exception, lambda : memory_buffer[length])
        # Seal the object.
        self.plasma_client.seal(object_id)
        # This test is commented out because it currently fails.
        # # Make sure the object is read only now.
        # def illegal_assignment():
        #     memory_buffer[0] = chr(0)
        # self.assertRaises(Exception, illegal_assignment)
        # Get the object.
        memory_buffer = self.plasma_client.get(object_id)
        # Make sure the object is read only.
        def illegal_assignment():
            memory_buffer[0] = chr(0)
        self.assertRaises(Exception, illegal_assignment)

    def test_subscribe(self):
        # Subscribe to notifications from the Plasma Store. The returned socket
        # is not needed here; notifications are read through the client.
        self.plasma_client.subscribe()
        for i in [1, 10, 100, 1000, 10000, 100000]:
            object_ids = [random_object_id() for _ in range(i)]
            for object_id in object_ids:
                # Create an object and seal it to trigger a notification.
                self.plasma_client.create(object_id, 1000)
                self.plasma_client.seal(object_id)
            # Check that we received notifications for all of the objects.
            for object_id in object_ids:
                message_data = self.plasma_client.get_next_notification()
                self.assertEqual(object_id, message_data)
class TestPlasmaManager(unittest.TestCase):
    """Tests for two plasma stores with one plasma manager each, optionally backed by redis."""

    def setUp(self):
        # Start two PlasmaStores. The two socket names are sampled without
        # replacement so that the stores can never be given the same name.
        plasma_store_executable = os.path.join(os.path.abspath(os.path.dirname(__file__)), "../build/plasma_store")
        store_suffix1, store_suffix2 = random.sample(range(10001), 2)
        store_name1 = "/tmp/store{}".format(store_suffix1)
        store_name2 = "/tmp/store{}".format(store_suffix2)
        plasma_store_command1 = [plasma_store_executable, "-s", store_name1]
        plasma_store_command2 = [plasma_store_executable, "-s", store_name2]
        valgrind_prefix = ["valgrind", "--track-origins=yes", "--leak-check=full", "--error-exitcode=1"]
        if USE_VALGRIND:
            self.p2 = subprocess.Popen(valgrind_prefix + plasma_store_command1)
            self.p3 = subprocess.Popen(valgrind_prefix + plasma_store_command2)
        else:
            self.p2 = subprocess.Popen(plasma_store_command1)
            self.p3 = subprocess.Popen(plasma_store_command2)
        # Start a Redis server.
        redis_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), "../common/thirdparty/redis-3.2.3/src/redis-server")
        self.redis_process = None
        manager_redis_args = []
        if os.path.exists(redis_path):
            redis_port = 6379
            with open(os.devnull, 'w') as FNULL:
                self.redis_process = subprocess.Popen([redis_path,
                                                       "--port", str(redis_port)],
                                                      stdout=FNULL)
            time.sleep(0.1)
            manager_redis_args = ["-d", "{addr}:{port}".format(addr="127.0.0.1",
                                                               port=redis_port)]
        # Start two PlasmaManagers. The ports are sampled without replacement so
        # that both managers can never be asked to listen on the same port.
        self.port1, self.port2 = random.sample(range(10000, 50001), 2)
        plasma_manager_executable = os.path.join(os.path.abspath(os.path.dirname(__file__)), "../build/plasma_manager")
        plasma_manager_command1 = [plasma_manager_executable,
                                   "-s", store_name1,
                                   "-m", "127.0.0.1",
                                   "-p", str(self.port1)] + manager_redis_args
        plasma_manager_command2 = [plasma_manager_executable,
                                   "-s", store_name2,
                                   "-m", "127.0.0.1",
                                   "-p", str(self.port2)] + manager_redis_args
        if USE_VALGRIND:
            self.p4 = subprocess.Popen(valgrind_prefix + plasma_manager_command1)
            self.p5 = subprocess.Popen(valgrind_prefix + plasma_manager_command2)
            time.sleep(2.0)
        else:
            self.p4 = subprocess.Popen(plasma_manager_command1)
            self.p5 = subprocess.Popen(plasma_manager_command2)
            time.sleep(0.1)
        # Connect two PlasmaClients.
        self.client1 = plasma.PlasmaClient(store_name1, "127.0.0.1", self.port1)
        self.client2 = plasma.PlasmaClient(store_name2, "127.0.0.1", self.port2)
        time.sleep(0.5)

    def tearDown(self):
        # Kill the PlasmaStore and PlasmaManager processes. The managers (p4,
        # p5) are stopped before the stores (p2, p3).
        valgrind_failed = False
        if USE_VALGRIND:
            for process in [self.p4, self.p5, self.p2, self.p3]:
                process.send_signal(signal.SIGTERM)
                process.wait()
            valgrind_failed = (self.p2.returncode != 0 or self.p3.returncode != 0 or
                               self.p4.returncode != 0 or self.p5.returncode != 0)
        else:
            processes = [self.p2, self.p3, self.p4, self.p5]
            for process in processes:
                process.kill()
            # Reap the killed processes so that they do not linger as zombies.
            for process in processes:
                process.wait()
        # Stop redis before possibly aborting below, otherwise the hard exit
        # would leave the redis server running.
        if self.redis_process:
            self.redis_process.kill()
            self.redis_process.wait()
        if valgrind_failed:
            print("aborting due to valgrind error")
            os._exit(-1)

    def test_fetch(self):
        if self.redis_process is None:
            self.fail("Cannot test fetch without a running redis instance.")
        for _ in range(100):
            # Create an object.
            object_id1, memory_buffer1, metadata1 = create_object(self.client1, 2000, 2000)
            # Fetch the object from the other plasma store.
            # TODO(swang): This line is a hack! It makes sure that the entry will be
            # in the object table once we call the fetch operation. Remove once
            # retries are implemented by Ray common.
            time.sleep(0.1)
            successes = self.client2.fetch([object_id1])
            self.assertEqual(successes, [True])
            # Compare the two buffers.
            assert_get_object_equal(self, self.client1, self.client2, object_id1,
                                    memory_buffer=memory_buffer1, metadata=metadata1)
            # Fetch in the other direction. These should return quickly because
            # client1 already has the object.
            successes = self.client1.fetch([object_id1])
            self.assertEqual(successes, [True])
            assert_get_object_equal(self, self.client2, self.client1, object_id1,
                                    memory_buffer=memory_buffer1, metadata=metadata1)

    def test_fetch_multiple(self):
        if self.redis_process is None:
            self.fail("Cannot test fetch without a running redis instance.")
        for _ in range(20):
            # Create two objects and a third fake one that doesn't exist.
            object_id1, memory_buffer1, metadata1 = create_object(self.client1, 2000, 2000)
            missing_object_id = random_object_id()
            object_id2, memory_buffer2, metadata2 = create_object(self.client1, 2000, 2000)
            object_ids = [object_id1, missing_object_id, object_id2]
            # Fetch the objects from the other plasma store. The second object ID
            # should timeout since it does not exist.
            # TODO(swang): This line is a hack! It makes sure that the entry will be
            # in the object table once we call the fetch operation. Remove once
            # retries are implemented by Ray common.
            time.sleep(0.1)
            successes = self.client2.fetch(object_ids)
            self.assertEqual(successes, [True, False, True])
            # Compare the buffers of the objects that do exist.
            assert_get_object_equal(self, self.client1, self.client2, object_id1,
                                    memory_buffer=memory_buffer1, metadata=metadata1)
            assert_get_object_equal(self, self.client1, self.client2, object_id2,
                                    memory_buffer=memory_buffer2, metadata=metadata2)
            # Fetch in the other direction. The fake object still does not exist.
            successes = self.client1.fetch(object_ids)
            self.assertEqual(successes, [True, False, True])
            assert_get_object_equal(self, self.client2, self.client1, object_id1,
                                    memory_buffer=memory_buffer1, metadata=metadata1)
            assert_get_object_equal(self, self.client2, self.client1, object_id2,
                                    memory_buffer=memory_buffer2, metadata=metadata2)

    def test_transfer(self):
        for _ in range(100):
            # Create an object.
            object_id1, memory_buffer1, metadata1 = create_object(self.client1, 2000, 2000)
            # Transfer the buffer to the other PlasmaStore.
            self.client1.transfer("127.0.0.1", self.port2, object_id1)
            # Compare the two buffers.
            assert_get_object_equal(self, self.client1, self.client2, object_id1,
                                    memory_buffer=memory_buffer1, metadata=metadata1)
            # # Transfer the buffer again.
            # self.client1.transfer("127.0.0.1", self.port2, object_id1)
            # # Compare the two buffers.
            # assert_get_object_equal(self, self.client1, self.client2, object_id1,
            #                         memory_buffer=memory_buffer1, metadata=metadata1)
            # Create an object.
            object_id2, memory_buffer2, metadata2 = create_object(self.client2, 20000, 20000)
            # Transfer the buffer to the other PlasmaStore.
            self.client2.transfer("127.0.0.1", self.port1, object_id2)
            # Compare the two buffers.
            assert_get_object_equal(self, self.client1, self.client2, object_id2,
                                    memory_buffer=memory_buffer2, metadata=metadata2)

    def test_illegal_functionality(self):
        # Create an object id string.
        object_id = random_object_id()
        # Create a new buffer.
        # memory_buffer = self.client1.create(object_id, 20000)
        # This test is commented out because it currently fails.
        # # Transferring the buffer before sealing it should fail.
        # self.assertRaises(Exception, lambda : self.manager1.transfer(1, object_id))

    def test_stresstest(self):
        a = time.time()
        object_ids = []
        for i in range(10000): # TODO(pcm): increase this to 100000
            object_id = random_object_id()
            object_ids.append(object_id)
            self.client1.create(object_id, 1)
            self.client1.seal(object_id)
        for object_id in object_ids:
            self.client1.transfer("127.0.0.1", self.port2, object_id)
        b = time.time() - a
        print("it took", b, "seconds to put and transfer the objects")
if __name__ == "__main__":
    # A trailing "valgrind" argument is our own switch. It has to be removed
    # from sys.argv before unittest runs its own argument parser.
    if len(sys.argv) > 1 and sys.argv[-1] == "valgrind":
        arg = sys.argv.pop()
        USE_VALGRIND = True
        print("Using valgrind for tests")
    unittest.main(verbosity=2)
+6280
View File
File diff suppressed because it is too large Load Diff
-870
View File
@@ -1,870 +0,0 @@
// TODO: - Implement other datatypes for ndarray
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include <Python.h>
#include <structmember.h>
#define PY_ARRAY_UNIQUE_SYMBOL RAYLIB_ARRAY_API
#include <numpy/arrayobject.h>
#include <iostream>
#include "types.pb.h"
#include "worker.h"
#include "utils.h"
RayConfig global_ray_config;
extern "C" {
static int PyObjectToWorker(PyObject* object, Worker **worker);
// Object references
// Python object wrapping an ObjectID. Instances are created through
// PyObjectIDType; the reference count of the wrapped object ID is incremented
// in PyObjectID_init and decremented again in PyObjectID_dealloc.
typedef struct {
PyObject_HEAD
// The wrapped object ID. Exposed to Python as the "id" member.
ObjectID id;
// We give the PyObjectID object a reference to the worker capsule object to
// make sure that the worker capsule does not go out of scope until all of the
// object references have gone out of scope. The reason for this is that the
// worker capsule destructor destroys the worker object. If the worker object
// has been destroyed, then when the object reference tries to call
// worker->decrement_reference_count, we can get a segfault.
PyObject* worker_capsule;
} PyObjectID;
// Destructor (tp_dealloc) for PyObjectID.
//
// Tells the worker that this Python object no longer references its object ID
// (worker->decrement_reference_count) and drops the reference on the worker
// capsule that was taken in PyObjectID_init.
//
// An instance whose tp_init failed or never ran still reaches this function
// with worker_capsule unset. That case is handled explicitly: without the
// checks below, an uninitialized Worker* would be dereferenced and Py_DECREF
// would be applied to NULL.
static void PyObjectID_dealloc(PyObjectID *self) {
  if (self->worker_capsule != NULL) {
    // PyCapsule_IsValid is used directly instead of PyObjectToWorker so that
    // no Python exception is raised from inside a destructor.
    if (PyCapsule_IsValid(self->worker_capsule, "worker")) {
      Worker* worker = static_cast<Worker*>(PyCapsule_GetPointer(self->worker_capsule, "worker"));
      std::vector<ObjectID> objectids;
      objectids.push_back(self->id);
      RAY_LOG(RAY_REFCOUNT, "In PyObjectID_dealloc, calling decrement_reference_count for objectid " << self->id);
      worker->decrement_reference_count(objectids);
    }
    Py_DECREF(self->worker_capsule); // The corresponding increment happens in PyObjectID_init.
  }
  self->ob_type->tp_free((PyObject*) self);
}
// Allocator (tp_new) for PyObjectID. Allocates the instance through the type's
// tp_alloc slot and sets its id to 0. Returns NULL if the allocation failed.
static PyObject* PyObjectID_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
  PyObject* instance = type->tp_alloc(type, 0);
  if (instance == NULL) {
    return NULL;
  }
  reinterpret_cast<PyObjectID*>(instance)->id = 0;
  return instance;
}
// Initializer (tp_init) for PyObjectID.
//
// Expects the arguments (id, worker_capsule). Stores both on the instance,
// takes a reference on the worker capsule and increments the worker's
// reference count for the object ID.
//
// Returns 0 on success. Returns -1 with a Python exception set if the
// arguments cannot be parsed or if worker_capsule is not a "worker" capsule.
// In the latter case no reference is taken and self->worker_capsule is left
// untouched, so that a Worker* is never used uninitialized.
static int PyObjectID_init(PyObjectID *self, PyObject *args, PyObject *kwds) {
  PyObject* worker_capsule;
  if (!PyArg_ParseTuple(args, "iO", &self->id, &worker_capsule)) {
    return -1;
  }
  Worker* worker;
  if (!PyObjectToWorker(worker_capsule, &worker)) {
    return -1;
  }
  self->worker_capsule = worker_capsule;
  Py_INCREF(self->worker_capsule); // The corresponding decrement happens in PyObjectID_dealloc.
  std::vector<ObjectID> objectids;
  objectids.push_back(self->id);
  RAY_LOG(RAY_REFCOUNT, "In PyObjectID_init, calling increment_reference_count for objectid " << objectids[0]);
  worker->increment_reference_count(objectids);
  return 0;
}
// Three-way comparison (tp_compare) of two PyObjectIDs by their id field.
// Returns -1, 0 or 1 when a's id is smaller than, equal to or greater than
// b's id.
static int PyObjectID_compare(PyObject* a, PyObject* b) {
  const PyObjectID* lhs = reinterpret_cast<PyObjectID*>(a);
  const PyObjectID* rhs = reinterpret_cast<PyObjectID*>(b);
  if (lhs->id < rhs->id) {
    return -1;
  }
  return (lhs->id > rhs->id) ? 1 : 0;
}
// Hash function (tp_hash) for PyObjectID. The hash is the Python hash of the
// one-element tuple (id,).
//
// Returns -1 with a Python exception set if a temporary object cannot be
// allocated or if hashing fails.
static long PyObjectID_hash(PyObject* a) {
  PyObjectID* A = (PyObjectID*) a;
  PyObject* id_object = PyInt_FromLong(A->id);
  if (id_object == NULL) {
    return -1;
  }
  PyObject* tuple = PyTuple_New(1);
  if (tuple == NULL) {
    Py_DECREF(id_object);
    return -1;
  }
  PyTuple_SetItem(tuple, 0, id_object); // The tuple takes ownership of id_object.
  long hash = PyObject_Hash(tuple);
  Py_DECREF(tuple);
  return hash;
}
// Name and docstring of the "id" member. They are stored in mutable char
// arrays and referenced from the member table below.
char RAY_ID_LITERAL[] = "id";
char RAY_OBJECT_ID_LITERAL[] = "object id";
// Member table of PyObjectID: exposes the id field to Python as "id". The
// table is terminated by the {NULL} sentinel entry.
static PyMemberDef PyObjectID_members[] = {
{RAY_ID_LITERAL, T_INT, offsetof(PyObjectID, id), 0, RAY_OBJECT_ID_LITERAL},
{NULL}
};
// Type object for "ray.ObjectID". The initializer is positional, so the order
// of the entries below must follow the slot order of PyTypeObject; each entry
// is labelled with the slot it fills. The slots that are set are tp_dealloc,
// tp_compare, tp_hash, tp_flags, tp_doc, tp_members, tp_init and tp_new.
static PyTypeObject PyObjectIDType = {
PyObject_HEAD_INIT(NULL)
0, /* ob_size */
"ray.ObjectID", /* tp_name */
sizeof(PyObjectID), /* tp_basicsize */
0, /* tp_itemsize */
(destructor)PyObjectID_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
PyObjectID_compare, /* tp_compare */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
PyObjectID_hash, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT, /* tp_flags */
"Ray objects", /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
0, /* tp_methods */
PyObjectID_members, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)PyObjectID_init, /* tp_init */
0, /* tp_alloc */
PyObjectID_new, /* tp_new */
};
// Create a PyObjectID from C++ (could be made more efficient if necessary).
//
// Calls the PyObjectIDType constructor with (objectid, worker_capsule), so
// PyObjectID_init runs for the new object. Returns a new reference, or NULL
// with a Python exception set if the argument tuple cannot be built or the
// constructor fails.
PyObject* make_pyobjectid(PyObject* worker_capsule, ObjectID objectid) {
  // The "i" format unit reads an int from the variadic arguments, so the
  // object ID is converted explicitly to match it.
  PyObject* arglist = Py_BuildValue("(iO)", static_cast<int>(objectid), worker_capsule);
  if (arglist == NULL) {
    return NULL;
  }
  PyObject* result = PyObject_CallObject((PyObject*) &PyObjectIDType, arglist);
  Py_DECREF(arglist);
  return result;
}
// Error handling
static PyObject *RayError;
static PyObject *RaySizeError;
// Pass arguments from Python to C++
// Converter for PyArg_ParseTuple's "O&" format: extracts the Task* stored in
// a "task" capsule. Returns 1 on success; otherwise sets a TypeError and
// returns 0 without writing to *task.
static int PyObjectToTask(PyObject* object, Task **task) {
  if (!PyCapsule_IsValid(object, "task")) {
    PyErr_SetString(PyExc_TypeError, "must be a 'task' capsule");
    return 0;
  }
  void* raw_pointer = PyCapsule_GetPointer(object, "task");
  *task = static_cast<Task*>(raw_pointer);
  return 1;
}
// Converter for PyArg_ParseTuple's "O&" format: extracts the Obj* stored in
// an "obj" capsule. Returns 1 on success; otherwise sets a TypeError and
// returns 0 without writing to *obj.
static int PyObjectToObj(PyObject* object, Obj **obj) {
  if (!PyCapsule_IsValid(object, "obj")) {
    PyErr_SetString(PyExc_TypeError, "must be a 'obj' capsule");
    return 0;
  }
  void* raw_pointer = PyCapsule_GetPointer(object, "obj");
  *obj = static_cast<Obj*>(raw_pointer);
  return 1;
}
// Converter for PyArg_ParseTuple's "O&" format: extracts the Worker* stored in
// a "worker" capsule. Returns 1 on success; otherwise sets a TypeError and
// returns 0 without writing to *worker.
static int PyObjectToWorker(PyObject* object, Worker **worker) {
  if (!PyCapsule_IsValid(object, "worker")) {
    PyErr_SetString(PyExc_TypeError, "must be a 'worker' capsule");
    return 0;
  }
  void* raw_pointer = PyCapsule_GetPointer(object, "worker");
  *worker = static_cast<Worker*>(raw_pointer);
  return 1;
}
// Converter for PyArg_ParseTuple's "O&" format: extracts the ObjectID wrapped
// by a PyObjectID instance.
//
// Returns 1 on success. Returns 0 with a Python exception set otherwise: a
// TypeError if the object is not a PyObjectID, or the exception raised by the
// instance check itself. PyObject_IsInstance returns -1 on error, which must
// not be mistaken for "is an instance".
static int PyObjectToObjectID(PyObject* object, ObjectID *objectid) {
  int is_objectid = PyObject_IsInstance(object, (PyObject*)&PyObjectIDType);
  if (is_objectid < 0) {
    // The instance check failed and has already set an exception.
    return 0;
  }
  if (is_objectid == 0) {
    PyErr_SetString(PyExc_TypeError, "must be an object reference");
    return 0;
  }
  *objectid = ((PyObjectID*) object)->id;
  return 1;
}
// Destructors
// Capsule destructor: deletes the Obj owned by an "obj" capsule.
static void ObjCapsule_Destructor(PyObject* capsule) {
  void* raw_pointer = PyCapsule_GetPointer(capsule, "obj");
  delete static_cast<Obj*>(raw_pointer);
}
// Capsule destructor: deletes the Worker owned by a "worker" capsule.
static void WorkerCapsule_Destructor(PyObject* capsule) {
  void* raw_pointer = PyCapsule_GetPointer(capsule, "worker");
  delete static_cast<Worker*>(raw_pointer);
}
// Capsule destructor: deletes the Task owned by a "task" capsule.
static void TaskCapsule_Destructor(PyObject* capsule) {
  void* raw_pointer = PyCapsule_GetPointer(capsule, "task");
  delete static_cast<Task*>(raw_pointer);
}
// Helper methods
// Pass ownership of both the key and the value to the PyDict.
// This is only required for PyDicts, not for PyLists or PyTuples, compare
// https://docs.python.org/2/c-api/dict.html
// https://docs.python.org/2/c-api/list.html
// https://docs.python.org/2/c-api/tuple.html
//
// The caller's references to key and val are always released, whether or not
// the insertion takes place. If either of them is NULL (for example because
// creating it failed), nothing is inserted: the release below tolerates NULL,
// but PyDict_SetItem does not.
void set_dict_item_and_transfer_ownership(PyObject* dict, PyObject* key, PyObject* val) {
  if (key != NULL && val != NULL) {
    PyDict_SetItem(dict, key, val);
  }
  Py_XDECREF(key);
  Py_XDECREF(val);
}
// Converts a Python ObjectID to a Python integer.
// Python arguments: (worker capsule, ObjectID). The worker is only validated,
// it is not used otherwise.
static PyObject* serialize_objectid(PyObject* self, PyObject* args) {
  Worker* validated_worker;
  ObjectID id_value;
  const int parsed = PyArg_ParseTuple(args, "O&O&", &PyObjectToWorker, &validated_worker, &PyObjectToObjectID, &id_value);
  if (parsed == 0) {
    return NULL;
  }
  return PyInt_FromLong(id_value);
}
// Converts a Python integer to a Python ObjectID.
// Python arguments: (worker capsule, integer id). The new ObjectID is created
// with make_pyobjectid.
static PyObject* deserialize_objectid(PyObject* self, PyObject* args) {
  PyObject* capsule;
  int raw_id;
  const int parsed = PyArg_ParseTuple(args, "Oi", &capsule, &raw_id);
  if (parsed == 0) {
    return NULL;
  }
  const ObjectID id_value = static_cast<ObjectID>(raw_id);
  return make_pyobjectid(capsule, id_value);
}
// Asks the worker to allocate a buffer of the given size for an object ID.
// Python arguments: (worker capsule, ObjectID, size).
// Returns a tuple (array, segmentid) where array is a one-dimensional NPY_BYTE
// numpy array of length size that wraps the address returned by the worker.
static PyObject* allocate_buffer(PyObject* self, PyObject* args) {
  Worker* worker;
  ObjectID objectid;
  SegmentId segmentid;
  long size;
  const int parsed = PyArg_ParseTuple(args, "O&O&l", &PyObjectToWorker, &worker, &PyObjectToObjectID, &objectid, &size);
  if (parsed == 0) {
    return NULL;
  }
  void* address = reinterpret_cast<void*>(const_cast<char*>(worker->allocate_buffer(objectid, size, segmentid)));
  npy_intp dims[1] = {static_cast<npy_intp>(size)};
  PyObject* result = PyTuple_New(2);
  PyObject* array = PyArray_SimpleNewFromData(1, dims, NPY_BYTE, address);
  PyTuple_SetItem(result, 0, array);
  PyTuple_SetItem(result, 1, PyInt_FromLong(segmentid));
  return result;
}
// Forwards to worker->finish_buffer and returns its result.
// Python arguments: (worker capsule, ObjectID, segmentid, metadata_offset).
static PyObject* finish_buffer(PyObject* self, PyObject* args) {
  Worker* worker;
  ObjectID objectid;
  long segment;
  long metadata_start;
  const int parsed = PyArg_ParseTuple(args, "O&O&ll", &PyObjectToWorker, &worker, &PyObjectToObjectID, &objectid, &segment, &metadata_start);
  if (parsed == 0) {
    return NULL;
  }
  return worker->finish_buffer(objectid, segment, metadata_start);
}
// Looks up the buffer of an object through the worker.
// Python arguments: (worker capsule, ObjectID).
// Returns a tuple (array, segmentid, metadata_offset) where array is a
// one-dimensional NPY_BYTE numpy array that wraps the address returned by the
// worker; its length is the size reported by the worker.
static PyObject* get_buffer(PyObject* self, PyObject* args) {
  Worker* worker;
  ObjectID objectid;
  int64_t size;
  SegmentId segmentid;
  int64_t metadata_offset;
  const int parsed = PyArg_ParseTuple(args, "O&O&", &PyObjectToWorker, &worker, &PyObjectToObjectID, &objectid);
  if (parsed == 0) {
    return NULL;
  }
  void* address = reinterpret_cast<void*>(const_cast<char*>(worker->get_buffer(objectid, size, segmentid, metadata_offset)));
  npy_intp dims[1] = {static_cast<npy_intp>(size)};
  PyObject* result = PyTuple_New(3);
  PyObject* array = PyArray_SimpleNewFromData(1, dims, NPY_BYTE, address);
  PyTuple_SetItem(result, 0, array);
  PyTuple_SetItem(result, 1, PyInt_FromLong(segmentid));
  PyTuple_SetItem(result, 2, PyInt_FromLong(metadata_offset));
  return result;
}
// Returns True if worker->is_arrow reports true for the object ID, False
// otherwise. Python arguments: (worker capsule, ObjectID).
static PyObject* is_arrow(PyObject* self, PyObject* args) {
  Worker* worker;
  ObjectID objectid;
  const int parsed = PyArg_ParseTuple(args, "O&O&", &PyObjectToWorker, &worker, &PyObjectToObjectID, &objectid);
  if (parsed == 0) {
    return NULL;
  }
  if (!worker->is_arrow(objectid)) {
    Py_RETURN_FALSE;
  }
  Py_RETURN_TRUE;
}
// Forwards to worker->unmap_object for the given segment and returns None.
// Python arguments: (worker capsule, segmentid).
static PyObject* unmap_object(PyObject* self, PyObject* args) {
  Worker* worker;
  int segment;
  const int parsed = PyArg_ParseTuple(args, "O&i", &PyObjectToWorker, &worker, &segment);
  if (parsed == 0) {
    return NULL;
  }
  worker->unmap_object(segment);
  Py_RETURN_NONE;
}
// Builds a Task from a function name and a list of arguments and returns it
// wrapped in a "task" capsule (which owns the Task and deletes it through
// TaskCapsule_Destructor).
//
// Python arguments: (worker capsule, name, arguments). Every element of the
// arguments list must be either an ObjectID (passed by reference) or a string
// holding a serialized value (passed by value). The worker's reference counts
// are incremented for all object IDs that are passed by reference.
//
// Returns NULL with a Python exception set if the arguments cannot be parsed,
// if arguments is not a list, or if the first argument is not a "worker"
// capsule. All of these checks run before the Task is allocated, so that no
// error path leaks it.
static PyObject* serialize_task(PyObject* self, PyObject* args) {
  PyObject* worker_capsule;
  char* name;
  int len;
  PyObject* arguments;
  if (!PyArg_ParseTuple(args, "Os#O", &worker_capsule, &name, &len, &arguments)) {
    return NULL;
  }
  if (!PyList_Check(arguments)) {
    PyErr_SetString(RayError, "serialize_task: second argument needs to be a list");
    return NULL;
  }
  Worker* worker;
  if (!PyObjectToWorker(worker_capsule, &worker)) {
    return NULL;
  }
  Task* task = new Task(); // Ownership is handed to the capsule created below.
  task->set_name(std::string(name, len));
  std::vector<ObjectID> objectids; // This is a vector of all the objectids that are serialized in this task, including objectids that are contained in Python objects that are passed by value.
  for (size_t i = 0, size = PyList_Size(arguments); i < size; ++i) {
    PyObject* element = PyList_GetItem(arguments, i);
    if (PyObject_IsInstance(element, (PyObject*)&PyObjectIDType)) {
      // Handle the case where the argument to the task is an ObjectID.
      ObjectID objectid = ((PyObjectID*) element)->id;
      task->add_arg()->set_objectid(objectid);
      objectids.push_back(objectid);
    } else if (PyString_CheckExact(element)) {
      // Handle the case where the argument to the task is being passed by
      // value and we receive an argument serialized as a string here.
      char* buffer;
      Py_ssize_t length;
      PyString_AsStringAndSize(element, &buffer, &length);
      task->add_arg()->set_serialized_arg(std::string(buffer, length));
    } else {
      RAY_CHECK(false, "This code should be unreachable.");
    }
  }
  if (objectids.size() > 0) {
    RAY_LOG(RAY_REFCOUNT, "In serialize_task, calling increment_reference_count for contained objectids");
    worker->increment_reference_count(objectids);
  }
  PyObject* capsule = PyCapsule_New(static_cast<void*>(task), "task", &TaskCapsule_Destructor);
  if (capsule == NULL) {
    // No capsule took ownership of the task, so it has to be freed here.
    delete task;
  }
  return capsule;
}
// Converts a Task into the Python tuple (name, arguments, results).
//
// arguments is a list that contains a PyObjectID for every argument passed by
// reference and a string for every argument passed by value (an argument is
// treated as passed by reference when its serialized_arg is empty). results
// is a list of PyObjectIDs, one per result of the task.
//
// Every PyObjectID is created with make_pyobjectid before the matching
// decrement_reference_count call below is made, so the statement order in
// this function matters.
// NOTE(review): the return values of PyObjectToWorker, make_pyobjectid and the
// PyList/PyTuple constructors are not checked here.
static PyObject* deserialize_task(PyObject* worker_capsule, const Task& task) {
std::vector<ObjectID> objectids; // This is a vector of all the objectids that were serialized in this task, including objectids that are contained in Python objects that are passed by value.
PyObject* string = PyString_FromStringAndSize(task.name().c_str(), task.name().size());
int argsize = task.arg_size();
PyObject* arglist = PyList_New(argsize);
for (int i = 0; i < argsize; ++i) {
if (task.arg(i).serialized_arg().empty()) {
// Passed by reference: wrap the object ID in a PyObjectID.
PyList_SetItem(arglist, i, make_pyobjectid(worker_capsule, task.arg(i).objectid()));
objectids.push_back(task.arg(i).objectid());
} else {
// Passed by value: hand the serialized bytes to Python as a string.
PyObject* serialized_arg = PyString_FromStringAndSize(task.arg(i).serialized_arg().data(), task.arg(i).serialized_arg().size());
PyList_SetItem(arglist, i, serialized_arg);
}
}
Worker* worker;
PyObjectToWorker(worker_capsule, &worker);
// NOTE(review): presumably balances the increment done in serialize_task for
// the by-reference arguments -- confirm.
worker->decrement_reference_count(objectids);
int resultsize = task.result_size();
std::vector<ObjectID> result_objectids;
PyObject* resultlist = PyList_New(resultsize);
for (int i = 0; i < resultsize; ++i) {
PyList_SetItem(resultlist, i, make_pyobjectid(worker_capsule, task.result(i)));
result_objectids.push_back(task.result(i));
}
worker->decrement_reference_count(result_objectids); // The corresponding increment is done in SubmitTask in the scheduler.
PyObject* t = PyTuple_New(3); // We set the items of the tuple using PyTuple_SetItem, because that transfers ownership to the tuple.
PyTuple_SetItem(t, 0, string);
PyTuple_SetItem(t, 1, arglist);
PyTuple_SetItem(t, 2, resultlist);
return t;
}
// Ray Python API
// Creates and registers a Worker.
// Python arguments: (node_ip_address, scheduler_address, objstore_address,
// mode, log_file_name). Logging is redirected to log_file_name before the
// worker is created.
// Returns a tuple (worker capsule, worker address). The capsule owns the
// Worker and deletes it through WorkerCapsule_Destructor.
static PyObject* create_worker(PyObject* self, PyObject* args) {
  const char* node_ip_address;
  const char* scheduler_address;
  // The object store address can be the empty string, in which case the
  // scheduler will choose the object store address.
  const char* objstore_address;
  int mode;
  const char* log_file_name;
  const int parsed = PyArg_ParseTuple(args, "sssis", &node_ip_address, &scheduler_address, &objstore_address, &mode, &log_file_name);
  if (parsed == 0) {
    return NULL;
  }
  // Route the log output to the requested file.
  create_log_dir_or_die(log_file_name);
  global_ray_config.log_to_file = true;
  global_ray_config.logfile.open(log_file_name);
  // Every mode other than WORKER_MODE is treated as a driver.
  bool is_driver = (mode != Mode::WORKER_MODE);
  const std::string node_ip(node_ip_address);
  Worker* worker = new Worker(node_ip, std::string(scheduler_address), static_cast<Mode>(mode));
  worker->register_worker(node_ip, std::string(objstore_address), is_driver);
  // Package the worker and its address for Python.
  PyObject* result = PyTuple_New(2);
  PyObject* worker_capsule = PyCapsule_New(static_cast<void*>(worker), "worker", &WorkerCapsule_Destructor);
  PyTuple_SetItem(result, 0, worker_capsule);
  PyTuple_SetItem(result, 1, PyString_FromString(worker->get_worker_address()));
  return result;
}
// Calls worker->disconnect() and returns None.
// Python arguments: (worker capsule).
static PyObject* disconnect(PyObject* self, PyObject* args) {
  Worker* worker;
  const int parsed = PyArg_ParseTuple(args, "O&", &PyObjectToWorker, &worker);
  if (parsed == 0) {
    return NULL;
  }
  worker->disconnect();
  Py_RETURN_NONE;
}
// Returns True if worker->connected() reports true, False otherwise.
// Python arguments: (worker capsule).
static PyObject* connected(PyObject* self, PyObject* args) {
  Worker* worker;
  const int parsed = PyArg_ParseTuple(args, "O&", &PyObjectToWorker, &worker);
  if (parsed == 0) {
    return NULL;
  }
  if (!worker->connected()) {
    Py_RETURN_FALSE;
  }
  Py_RETURN_TRUE;
}
// Receives the next message for the worker and converts it to a Python tuple
// (kind, payload). Python arguments: (worker capsule).
//
// kind is one of:
//   "task"               payload is the tuple built by deserialize_task
//   "function"           payload is (name, implementation)
//   "reusable_variable"  payload is (name, initializer, reinitializer)
//   "function_to_run"    payload is the implementation string
//   "die"                payload is None (the message contains no item)
//
// Returns NULL with a Python exception set if the arguments cannot be parsed
// or if the argument is not a "worker" capsule. The capsule is validated
// before use so that a Worker* is never dereferenced uninitialized.
static PyObject* wait_for_next_message(PyObject* self, PyObject* args) {
  PyObject* worker_capsule;
  if (!PyArg_ParseTuple(args, "O", &worker_capsule)) {
    return NULL;
  }
  Worker* worker;
  if (!PyObjectToWorker(worker_capsule, &worker)) {
    return NULL;
  }
  if (std::unique_ptr<WorkerMessage> message = worker->receive_next_message()) {
    // An item counts as present when its identifying string is non-empty.
    bool task_present = !message->task().name().empty();
    bool function_present = !message->function().implementation().empty();
    bool reusable_variable_present = !message->reusable_variable().name().empty();
    bool function_to_run_present = !message->function_to_run().implementation().empty();
    RAY_CHECK(task_present + function_present + reusable_variable_present + function_to_run_present <= 1, "The worker message should contain at most one item.");
    PyObject* t = PyTuple_New(2);
    if (task_present) {
      PyTuple_SetItem(t, 0, PyString_FromString("task"));
      PyTuple_SetItem(t, 1, deserialize_task(worker_capsule, message->task()));
    } else if (function_present) {
      PyTuple_SetItem(t, 0, PyString_FromString("function"));
      PyObject* remote_function_data = PyTuple_New(2);
      PyTuple_SetItem(remote_function_data, 0, PyString_FromStringAndSize(message->function().name().data(), static_cast<ssize_t>(message->function().name().size())));
      PyTuple_SetItem(remote_function_data, 1, PyString_FromStringAndSize(message->function().implementation().data(), static_cast<ssize_t>(message->function().implementation().size())));
      PyTuple_SetItem(t, 1, remote_function_data);
    } else if (reusable_variable_present) {
      PyTuple_SetItem(t, 0, PyString_FromString("reusable_variable"));
      PyObject* reusable_variable = PyTuple_New(3);
      PyTuple_SetItem(reusable_variable, 0, PyString_FromStringAndSize(message->reusable_variable().name().data(), static_cast<ssize_t>(message->reusable_variable().name().size())));
      PyTuple_SetItem(reusable_variable, 1, PyString_FromStringAndSize(message->reusable_variable().initializer().implementation().data(), static_cast<ssize_t>(message->reusable_variable().initializer().implementation().size())));
      PyTuple_SetItem(reusable_variable, 2, PyString_FromStringAndSize(message->reusable_variable().reinitializer().implementation().data(), static_cast<ssize_t>(message->reusable_variable().reinitializer().implementation().size())));
      PyTuple_SetItem(t, 1, reusable_variable);
    } else if (function_to_run_present) {
      PyTuple_SetItem(t, 0, PyString_FromString("function_to_run"));
      PyTuple_SetItem(t, 1, PyString_FromStringAndSize(message->function_to_run().implementation().data(), static_cast<ssize_t>(message->function_to_run().implementation().size())));
    } else {
      // A message without any item tells the worker to shut down.
      PyTuple_SetItem(t, 0, PyString_FromString("die"));
      Py_INCREF(Py_None); // PyTuple_SetItem takes over this reference.
      PyTuple_SetItem(t, 1, Py_None);
    }
    return t;
  }
  RAY_CHECK(false, "This code should be unreachable.");
  Py_RETURN_NONE;
}
// Python binding: forwards a serialized function to
// Worker::run_function_on_all_workers.
//
// Python arguments: (worker, function) where `function` is parsed with "s#",
// i.e. together with an explicit byte length.
// Returns None, or NULL (with a Python exception set by the parser) when the
// arguments cannot be parsed.
static PyObject* run_function_on_all_workers(PyObject* self, PyObject* args) {
  Worker* worker;
  const char* payload;
  int payload_size;
  const int parsed = PyArg_ParseTuple(args, "O&s#", &PyObjectToWorker, &worker, &payload, &payload_size);
  if (parsed == 0) {
    return NULL;
  }
  // Construct the string from (pointer, length) so the full payload is
  // forwarded, not just the bytes up to a first NUL.
  worker->run_function_on_all_workers(std::string(payload, static_cast<size_t>(payload_size)));
  Py_RETURN_NONE;
}
// Python binding: forwards a named, serialized remote function to
// Worker::export_remote_function.
//
// Python arguments: (worker, function_name, function) where `function` is
// parsed with "s#" (pointer plus explicit length).
// Returns True or False mirroring the truthiness of the Worker call's result,
// or NULL when the arguments cannot be parsed.
static PyObject* export_remote_function(PyObject* self, PyObject* args) {
  Worker* worker;
  const char* name;
  const char* payload;
  int payload_size;
  const int parsed = PyArg_ParseTuple(args, "O&ss#", &PyObjectToWorker, &worker, &name, &payload, &payload_size);
  if (parsed == 0) {
    return NULL;
  }
  if (!worker->export_remote_function(std::string(name), std::string(payload, static_cast<size_t>(payload_size)))) {
    Py_RETURN_FALSE;
  }
  Py_RETURN_TRUE;
}
// Python binding: forwards a reusable variable (name, initializer and
// reinitializer payloads) to Worker::export_reusable_variable.
//
// Python arguments: (worker, name, initializer, reinitializer); each of the
// three payloads is parsed with "s#" (pointer plus explicit length).
// Returns None, or NULL when the arguments cannot be parsed.
static PyObject* export_reusable_variable(PyObject* self, PyObject* args) {
  Worker* worker;
  const char* name_data;
  int name_length;
  const char* init_data;
  int init_length;
  const char* reinit_data;
  int reinit_length;
  const int parsed = PyArg_ParseTuple(args, "O&s#s#s#", &PyObjectToWorker, &worker,
                                      &name_data, &name_length,
                                      &init_data, &init_length,
                                      &reinit_data, &reinit_length);
  if (parsed == 0) {
    return NULL;
  }
  // Each string is built from (pointer, length) so the whole payload is kept.
  std::string variable_name(name_data, static_cast<size_t>(name_length));
  std::string init_code(init_data, static_cast<size_t>(init_length));
  std::string reinit_code(reinit_data, static_cast<size_t>(reinit_length));
  worker->export_reusable_variable(variable_name, init_code, reinit_code);
  Py_RETURN_NONE;
}
// Python binding: submits a task through Worker::submit_task and returns a
// Python list with one object ID wrapper per result in the reply.
//
// Python arguments: (worker_capsule, task).
// Returns the list of result object IDs, or NULL with a Python exception set
// when:
//   - the arguments cannot be parsed or the worker capsule cannot be converted,
//   - the reply reports that no workers are registered (RayError),
//   - the reply reports that the function is not registered (RayError),
//   - building the result list fails.
static PyObject* submit_task(PyObject* self, PyObject* args) {
  PyObject* worker_capsule;
  Task* task;
  if (!PyArg_ParseTuple(args, "OO&", &worker_capsule, &PyObjectToTask, &task)) {
    return NULL;
  }
  Worker* worker;
  // PyObjectToWorker is also used as an "O&" converter, so a zero return means
  // the conversion failed; bail out instead of dereferencing an uninitialized
  // pointer.
  if (!PyObjectToWorker(worker_capsule, &worker)) {
    return NULL;
  }
  SubmitTaskRequest request;
  // The request only borrows the task for the duration of the call; ownership
  // is taken back with release_task() below so the request's destructor does
  // not free it.
  request.set_allocated_task(task);
  SubmitTaskReply reply = worker->submit_task(&request);
  request.release_task(); // TODO: Make sure that task is not moved, otherwise capsule pointer needs to be updated
  if (reply.no_workers()) {
    PyErr_SetString(RayError, "No workers have registered with the scheduler, so this function cannot be run.");
    return NULL;
  }
  if (!reply.function_registered()) {
    PyErr_SetString(RayError, "No worker has registered this function with the scheduler.");
    return NULL;
  }
  const int size = reply.result_size();
  // Collect every result ID up front so the reference-count decrement below
  // covers all of them even if building the Python list fails part-way.
  std::vector<ObjectID> result_objectids;
  result_objectids.reserve(static_cast<size_t>(size));
  for (int i = 0; i < size; ++i) {
    result_objectids.push_back(reply.result(i));
  }
  PyObject* list = PyList_New(size);
  bool ok = (list != NULL);
  for (int i = 0; ok && i < size; ++i) {
    PyObject* item = make_pyobjectid(worker_capsule, result_objectids[i]);
    if (item == NULL) {
      ok = false;
      break;
    }
    PyList_SetItem(list, i, item); // Steals the reference to item.
  }
  worker->decrement_reference_count(result_objectids); // The corresponding increment is done in SubmitTask in the scheduler.
  if (!ok) {
    // NOTE(review): assumes make_pyobjectid sets a Python exception when it
    // returns NULL -- confirm.
    Py_XDECREF(list);
    return NULL;
  }
  return list;
}
// Python binding: calls Worker::ready_for_new_task on the given worker.
//
// Python arguments: (worker).
// Returns None, or NULL when the argument cannot be parsed.
static PyObject* ready_for_new_task(PyObject* self, PyObject* args) {
  Worker* worker;
  const int parsed = PyArg_ParseTuple(args, "O&", &PyObjectToWorker, &worker);
  if (parsed == 0) {
    return NULL;
  }
  worker->ready_for_new_task();
  Py_RETURN_NONE;
}
// Python binding: forwards a function name and its number of return values to
// Worker::register_remote_function.
//
// Python arguments: (worker, function_name, num_return_vals).
// Returns None, or NULL when the arguments cannot be parsed.
static PyObject* register_remote_function(PyObject* self, PyObject* args) {
  Worker* worker;
  const char* name;
  int return_value_count;
  const int parsed = PyArg_ParseTuple(args, "O&si", &PyObjectToWorker, &worker, &name, &return_value_count);
  if (parsed == 0) {
    return NULL;
  }
  worker->register_remote_function(std::string(name), return_value_count);
  Py_RETURN_NONE;
}
// Python binding: reports a failure through Worker::notify_failure.
//
// Python arguments: (worker, name, error_message, type) where `type` is an
// integer that is cast to the FailedType enum.
// Returns None, or NULL when the arguments cannot be parsed.
static PyObject* notify_failure(PyObject* self, PyObject* args) {
  Worker* worker;
  const char* failed_name;
  const char* message;
  int failure_kind;
  const int parsed = PyArg_ParseTuple(args, "O&ssi", &PyObjectToWorker, &worker, &failed_name, &message, &failure_kind);
  if (parsed == 0) {
    return NULL;
  }
  worker->notify_failure(static_cast<FailedType>(failure_kind), std::string(failed_name), std::string(message));
  Py_RETURN_NONE;
}
// Python binding: obtains a fresh object ID from Worker::get_objectid and
// wraps it in a Python object via make_pyobjectid.
//
// Python arguments: (worker_capsule). The capsule is kept as a PyObject*
// because make_pyobjectid needs it in addition to the Worker pointer.
// Returns the wrapped object ID, or NULL with a Python exception set when the
// argument cannot be parsed or the capsule cannot be converted to a Worker.
static PyObject* get_objectid(PyObject* self, PyObject* args) {
  PyObject* worker_capsule;
  if (!PyArg_ParseTuple(args, "O", &worker_capsule)) {
    return NULL;
  }
  Worker* worker;
  // PyObjectToWorker is also used as an "O&" converter, so a zero return means
  // the conversion failed; do not dereference the uninitialized pointer.
  if (!PyObjectToWorker(worker_capsule, &worker)) {
    return NULL;
  }
  ObjectID objectid = worker->get_objectid();
  return make_pyobjectid(worker_capsule, objectid);
}
// Python binding: passes the object IDs contained inside a remote object to
// Worker::add_contained_objectids.
//
// Python arguments: (worker, objectid, contained_objectids) where
// contained_objectids must be a list of object IDs (enforced by RAY_CHECK).
// Returns None, or NULL with a Python exception set when the arguments cannot
// be parsed or a list element cannot be converted to an ObjectID.
static PyObject* add_contained_objectids(PyObject* self, PyObject* args) {
  Worker* worker;
  ObjectID objectid;
  PyObject* contained_objectids;
  if (!PyArg_ParseTuple(args, "O&O&O", &PyObjectToWorker, &worker, &PyObjectToObjectID, &objectid, &contained_objectids)) {
    return NULL;
  }
  RAY_CHECK(PyList_Check(contained_objectids), "The contained_objectids argument must be a list.");
  const Py_ssize_t size = PyList_Size(contained_objectids);
  std::vector<ObjectID> vec_contained_objectids;
  vec_contained_objectids.reserve(static_cast<size_t>(size));
  for (Py_ssize_t i = 0; i < size; ++i) {
    ObjectID contained_objectid;
    // PyObjectToObjectID is also used as an "O&" converter, so a zero return
    // means the element was rejected; stop instead of forwarding an
    // uninitialized ID and returning None with an exception pending.
    if (!PyObjectToObjectID(PyList_GetItem(contained_objectids, i), &contained_objectid)) {
      return NULL;
    }
    vec_contained_objectids.push_back(contained_objectid);
  }
  worker->add_contained_objectids(objectid, vec_contained_objectids);
  Py_RETURN_NONE;
}
// Python binding: forwards an object ID to Worker::request_object.
//
// Python arguments: (worker, objectid).
// Returns None, or NULL when the arguments cannot be parsed.
static PyObject* request_object(PyObject* self, PyObject* args) {
  Worker* worker;
  ObjectID requested_id;
  const int parsed = PyArg_ParseTuple(args, "O&O&", &PyObjectToWorker, &worker, &PyObjectToObjectID, &requested_id);
  if (parsed == 0) {
    return NULL;
  }
  worker->request_object(requested_id);
  Py_RETURN_NONE;
}
// Python binding: converts a Python list of object IDs, passes it to
// Worker::wait and returns the resulting indices as a Python list of ints.
//
// Python arguments: (worker, objectids) where objectids must be a list.
// Returns a list of ints, or NULL with a Python exception set when the
// arguments cannot be parsed, objectids is not a list (TypeError), an element
// cannot be converted to an ObjectID, or the result list cannot be allocated.
static PyObject* wait(PyObject* self, PyObject* args) {
  Worker* worker;
  PyObject* objectids;
  if (!PyArg_ParseTuple(args, "O&O", &PyObjectToWorker, &worker, &objectids)) {
    return NULL;
  }
  // Without this check PyList_Size returns -1 for a non-list, which turns into
  // a huge bound once compared against an unsigned index.
  if (!PyList_Check(objectids)) {
    PyErr_SetString(PyExc_TypeError, "The objectids argument must be a list.");
    return NULL;
  }
  const Py_ssize_t num_objectids = PyList_Size(objectids);
  std::vector<ObjectID> objectids_vec;
  objectids_vec.reserve(static_cast<size_t>(num_objectids));
  for (Py_ssize_t i = 0; i < num_objectids; ++i) {
    ObjectID objectid;
    // PyObjectToObjectID is also used as an "O&" converter, so a zero return
    // means the element was rejected.
    if (!PyObjectToObjectID(PyList_GetItem(objectids, i), &objectid)) {
      return NULL;
    }
    objectids_vec.push_back(objectid);
  }
  std::vector<int> indices = worker->wait(objectids_vec);
  PyObject* result = PyList_New(indices.size());
  if (result == NULL) {
    return NULL;
  }
  for (size_t i = 0; i < indices.size(); ++i) {
    PyList_SetItem(result, i, PyInt_FromLong(indices[i])); // Steals the new int reference.
  }
  return result;
}
// Python binding: forwards an (alias, target) pair of object IDs to
// Worker::alias_objectids.
//
// Python arguments: (worker, alias_objectid, target_objectid).
// Returns None, or NULL when the arguments cannot be parsed.
static PyObject* alias_objectids(PyObject* self, PyObject* args) {
  Worker* worker;
  ObjectID alias_id;
  ObjectID target_id;
  const int parsed = PyArg_ParseTuple(args, "O&O&O&", &PyObjectToWorker, &worker,
                                      &PyObjectToObjectID, &alias_id,
                                      &PyObjectToObjectID, &target_id);
  if (parsed == 0) {
    return NULL;
  }
  worker->alias_objectids(alias_id, target_id);
  Py_RETURN_NONE;
}
// Python binding: fetches a SchedulerInfoReply through Worker::scheduler_info
// and repackages it as a Python dict with the keys
//   "target_objectids"  -> list of ints
//   "reference_counts"  -> list of ints
//   "available_workers" -> list of ints
//   "objstores"         -> list of dicts with keys "objstoreid" and "address"
//
// Python arguments: (worker).
// Returns the dict, or NULL when the argument cannot be parsed.
static PyObject* scheduler_info(PyObject* self, PyObject* args) {
  Worker* worker;
  if (!PyArg_ParseTuple(args, "O&", &PyObjectToWorker, &worker)) {
    return NULL;
  }
  ClientContext context;
  SchedulerInfoRequest request;
  SchedulerInfoReply reply;
  worker->scheduler_info(context, request, reply);

  // Target object IDs (alias information).
  const int num_targets = reply.target_objectid_size();
  PyObject* targets = PyList_New(num_targets);
  for (int idx = 0; idx < num_targets; ++idx) {
    PyList_SetItem(targets, idx, PyInt_FromLong(reply.target_objectid(idx)));
  }

  // Reference counts.
  const int num_ref_counts = reply.reference_count_size();
  PyObject* ref_counts = PyList_New(num_ref_counts);
  for (int idx = 0; idx < num_ref_counts; ++idx) {
    PyList_SetItem(ref_counts, idx, PyInt_FromLong(reply.reference_count(idx)));
  }

  // Available workers.
  const int num_avail = reply.avail_worker_size();
  PyObject* avail_workers = PyList_New(num_avail);
  for (int idx = 0; idx < num_avail; ++idx) {
    PyList_SetItem(avail_workers, idx, PyInt_FromLong(reply.avail_worker(idx)));
  }

  // Object stores: one small dict per store.
  const int num_stores = reply.objstore_size();
  PyObject* stores = PyList_New(num_stores);
  for (int idx = 0; idx < num_stores; ++idx) {
    const auto& store = reply.objstore(idx);
    PyObject* store_dict = PyDict_New();
    set_dict_item_and_transfer_ownership(store_dict, PyString_FromString("objstoreid"), PyInt_FromLong(store.objstoreid()));
    set_dict_item_and_transfer_ownership(store_dict, PyString_FromString("address"), PyString_FromStringAndSize(store.address().data(), store.address().size()));
    PyList_SetItem(stores, idx, store_dict);
  }

  // Bundle everything into the dictionary handed back to Python.
  PyObject* info = PyDict_New();
  set_dict_item_and_transfer_ownership(info, PyString_FromString("target_objectids"), targets);
  set_dict_item_and_transfer_ownership(info, PyString_FromString("reference_counts"), ref_counts);
  set_dict_item_and_transfer_ownership(info, PyString_FromString("available_workers"), avail_workers);
  set_dict_item_and_transfer_ownership(info, PyString_FromString("objstores"), stores);
  return info;
}
// Converts a Failure message into a new Python dict with the keys "workerid",
// "worker_address", "function_name" and "error_message".
// Returns the new dict.
static PyObject* failure_to_dict(const Failure& failure) {
  const auto& address = failure.worker_address();
  const auto& name = failure.name();
  const auto& message = failure.error_message();
  PyObject* result = PyDict_New();
  set_dict_item_and_transfer_ownership(result, PyString_FromString("workerid"), PyInt_FromLong(failure.workerid()));
  set_dict_item_and_transfer_ownership(result, PyString_FromString("worker_address"), PyString_FromStringAndSize(address.data(), address.size()));
  set_dict_item_and_transfer_ownership(result, PyString_FromString("function_name"), PyString_FromStringAndSize(name.data(), name.size()));
  set_dict_item_and_transfer_ownership(result, PyString_FromString("error_message"), PyString_FromStringAndSize(message.data(), message.size()));
  return result;
}
// Python binding: fetches a TaskInfoReply through Worker::task_info and
// repackages it as a Python dict with the keys
//   "failed_tasks"   -> list of dicts (worker_address, function_name,
//                       operationid, error_message)
//   "running_tasks"  -> list of dicts (worker_address, function_name,
//                       operationid)
//   "failed_remote_function_imports",
//   "failed_reusable_variable_imports",
//   "failed_reinitialize_reusable_variables",
//   "failed_function_to_runs" -> lists of dicts built by failure_to_dict
//
// Python arguments: (worker).
// Returns the dict, or NULL when the argument cannot be parsed.
static PyObject* task_info(PyObject* self, PyObject* args) {
  Worker* worker;
  if (!PyArg_ParseTuple(args, "O&", &PyObjectToWorker, &worker)) {
    return NULL;
  }
  ClientContext context;
  TaskInfoRequest request;
  TaskInfoReply reply;
  worker->task_info(context, request, reply);

  // Failed tasks carry an error message in addition to the common fields.
  const int num_failed = reply.failed_task_size();
  PyObject* failed_tasks = PyList_New(num_failed);
  for (int idx = 0; idx < num_failed; ++idx) {
    const TaskStatus& status = reply.failed_task(idx);
    PyObject* entry = PyDict_New();
    set_dict_item_and_transfer_ownership(entry, PyString_FromString("worker_address"), PyString_FromStringAndSize(status.worker_address().data(), status.worker_address().size()));
    set_dict_item_and_transfer_ownership(entry, PyString_FromString("function_name"), PyString_FromStringAndSize(status.function_name().data(), status.function_name().size()));
    set_dict_item_and_transfer_ownership(entry, PyString_FromString("operationid"), PyInt_FromLong(status.operationid()));
    set_dict_item_and_transfer_ownership(entry, PyString_FromString("error_message"), PyString_FromStringAndSize(status.error_message().data(), status.error_message().size()));
    PyList_SetItem(failed_tasks, idx, entry);
  }

  // Running tasks: same fields minus the error message.
  const int num_running = reply.running_task_size();
  PyObject* running_tasks = PyList_New(num_running);
  for (int idx = 0; idx < num_running; ++idx) {
    const TaskStatus& status = reply.running_task(idx);
    PyObject* entry = PyDict_New();
    set_dict_item_and_transfer_ownership(entry, PyString_FromString("worker_address"), PyString_FromStringAndSize(status.worker_address().data(), status.worker_address().size()));
    set_dict_item_and_transfer_ownership(entry, PyString_FromString("function_name"), PyString_FromStringAndSize(status.function_name().data(), status.function_name().size()));
    set_dict_item_and_transfer_ownership(entry, PyString_FromString("operationid"), PyInt_FromLong(status.operationid()));
    PyList_SetItem(running_tasks, idx, entry);
  }

  // The four failure categories share one representation (failure_to_dict).
  const int num_fn_import_failures = reply.failed_remote_function_import_size();
  PyObject* fn_import_failures = PyList_New(num_fn_import_failures);
  for (int idx = 0; idx < num_fn_import_failures; ++idx) {
    PyList_SetItem(fn_import_failures, idx, failure_to_dict(reply.failed_remote_function_import(idx)));
  }

  const int num_var_import_failures = reply.failed_reusable_variable_import_size();
  PyObject* var_import_failures = PyList_New(num_var_import_failures);
  for (int idx = 0; idx < num_var_import_failures; ++idx) {
    PyList_SetItem(var_import_failures, idx, failure_to_dict(reply.failed_reusable_variable_import(idx)));
  }

  const int num_reinit_failures = reply.failed_reinitialize_reusable_variable_size();
  PyObject* reinit_failures = PyList_New(num_reinit_failures);
  for (int idx = 0; idx < num_reinit_failures; ++idx) {
    PyList_SetItem(reinit_failures, idx, failure_to_dict(reply.failed_reinitialize_reusable_variable(idx)));
  }

  const int num_run_failures = reply.failed_function_to_run_size();
  PyObject* run_failures = PyList_New(num_run_failures);
  for (int idx = 0; idx < num_run_failures; ++idx) {
    PyList_SetItem(run_failures, idx, failure_to_dict(reply.failed_function_to_run(idx)));
  }

  PyObject* info = PyDict_New();
  set_dict_item_and_transfer_ownership(info, PyString_FromString("failed_tasks"), failed_tasks);
  set_dict_item_and_transfer_ownership(info, PyString_FromString("running_tasks"), running_tasks);
  set_dict_item_and_transfer_ownership(info, PyString_FromString("failed_remote_function_imports"), fn_import_failures);
  set_dict_item_and_transfer_ownership(info, PyString_FromString("failed_reusable_variable_imports"), var_import_failures);
  set_dict_item_and_transfer_ownership(info, PyString_FromString("failed_reinitialize_reusable_variables"), reinit_failures);
  set_dict_item_and_transfer_ownership(info, PyString_FromString("failed_function_to_runs"), run_failures);
  return info;
}
// Python binding: fetches the scheduler info through Worker::scheduler_info
// and serializes the computation graph contained in the reply to a file.
//
// Python arguments: (worker, output_file_name).
// The file is opened for binary output and truncated. A serialization failure
// is reported through RAY_CHECK.
// Returns None, or NULL when the arguments cannot be parsed.
static PyObject* dump_computation_graph(PyObject* self, PyObject* args) {
  Worker* worker;
  const char* output_file_name;
  const int parsed = PyArg_ParseTuple(args, "O&s", &PyObjectToWorker, &worker, &output_file_name);
  if (parsed == 0) {
    return NULL;
  }
  ClientContext context;
  SchedulerInfoRequest request;
  SchedulerInfoReply reply;
  worker->scheduler_info(context, request, reply);
  const std::ios::openmode mode = std::ios::out | std::ios::trunc | std::ios::binary;
  std::fstream output(output_file_name, mode);
  RAY_CHECK(reply.computation_graph().SerializeToOstream(&output), "Cannot dump computation graph to file " << output_file_name);
  Py_RETURN_NONE;
}
// Python binding: calls Worker::kill_workers and reports its result.
//
// Python arguments: (worker).
// Returns True or False mirroring the truthiness of the Worker call's result,
// or NULL when the argument cannot be parsed.
static PyObject* kill_workers(PyObject* self, PyObject* args) {
  Worker* worker;
  const int parsed = PyArg_ParseTuple(args, "O&", &PyObjectToWorker, &worker);
  if (parsed == 0) {
    return NULL;
  }
  ClientContext context;
  if (!worker->kill_workers(context)) {
    Py_RETURN_FALSE;
  }
  Py_RETURN_TRUE;
}
// Method table handed to Py_InitModule3 for the "libraylib" module.
// Each entry is {Python-visible name, C implementation, calling convention,
// docstring}; every function uses METH_VARARGS. The all-NULL entry at the end
// is the terminating sentinel.
static PyMethodDef RayLibMethods[] = {
  {"serialize_objectid", serialize_objectid, METH_VARARGS, "serialize an object id"},
  {"deserialize_objectid", deserialize_objectid, METH_VARARGS, "deserialize an object id"},
  {"allocate_buffer", allocate_buffer, METH_VARARGS, "Allocates and returns buffer for objectid."},
  {"finish_buffer", finish_buffer, METH_VARARGS, "Makes the buffer immutable and closes memory segment of objectid."},
  {"get_buffer", get_buffer, METH_VARARGS, "Gets buffer for objectid"},
  {"is_arrow", is_arrow, METH_VARARGS, "is the object in the local object store an arrow object?"},
  {"unmap_object", unmap_object, METH_VARARGS, "unmap the object from the client's shared memory pool"},
  {"serialize_task", serialize_task, METH_VARARGS, "serialize a task to protocol buffers"},
  {"create_worker", create_worker, METH_VARARGS, "connect to the scheduler and the object store"},
  {"disconnect", disconnect, METH_VARARGS, "disconnect the worker from the scheduler and the object store"},
  {"connected", connected, METH_VARARGS, "check if the worker is connected to the scheduler and the object store"},
  {"register_remote_function", register_remote_function, METH_VARARGS, "register a function with the scheduler"},
  {"notify_failure", notify_failure, METH_VARARGS, "notify the scheduler of a failure"},
  {"add_contained_objectids", add_contained_objectids, METH_VARARGS, "notify the scheduler about the object IDs contained in a remote object"},
  {"get_objectid", get_objectid, METH_VARARGS, "register a new object reference with the scheduler"},
  {"request_object", request_object, METH_VARARGS, "request an object to be delivered to the local object store"},
  {"wait", wait, METH_VARARGS, "checks the scheduler to see if a object can be gotten"},
  {"alias_objectids", alias_objectids, METH_VARARGS, "make two objectids refer to the same object"},
  {"wait_for_next_message", wait_for_next_message, METH_VARARGS, "get next message from scheduler (blocking)"},
  {"submit_task", submit_task, METH_VARARGS, "call a remote function"},
  {"ready_for_new_task", ready_for_new_task, METH_VARARGS, "notify the scheduler that the worker is ready for a new task"},
  {"scheduler_info", scheduler_info, METH_VARARGS, "get info about scheduler state"},
  {"task_info", task_info, METH_VARARGS, "get information about task statuses and failures"},
  {"run_function_on_all_workers", run_function_on_all_workers, METH_VARARGS, "run an arbitrary function on all workers"},
  {"export_remote_function", export_remote_function, METH_VARARGS, "export a remote function to workers"},
  {"export_reusable_variable", export_reusable_variable, METH_VARARGS, "export a reusable variable to the workers"},
  {"dump_computation_graph", dump_computation_graph, METH_VARARGS, "dump the current computation graph to a file"},
  {"kill_workers", kill_workers, METH_VARARGS, "kills all of the workers"},
  {NULL, NULL, 0, NULL}
};
// Module initialization entry point for the Python 2 extension "libraylib".
//
// Readies the ObjectID type, creates the module from RayLibMethods, registers
// the ObjectID type and the two exception objects (RayError, RaySizeError),
// initializes the NumPy C API and exports the Mode and FailedType enumerators
// as integer constants.
//
// On any failure the function returns early and leaves the pending Python
// exception in place.
PyMODINIT_FUNC initlibraylib(void) {
  PyObjectIDType.tp_new = PyType_GenericNew;
  if (PyType_Ready(&PyObjectIDType) < 0) {
    return;
  }
  PyObject* m = Py_InitModule3("libraylib", RayLibMethods, "Python C Extension for Ray");
  if (m == NULL) {
    // Module creation failed; every call below would dereference NULL.
    return;
  }
  // PyModule_AddObject steals a reference, so take one for the module first.
  Py_INCREF(&PyObjectIDType);
  PyModule_AddObject(m, "ObjectID", (PyObject *)&PyObjectIDType);
  // Mutable arrays rather than string literals are passed to
  // PyErr_NewException.
  char ray_error[] = "ray.error";
  char ray_size_error[] = "ray_size.error";
  RayError = PyErr_NewException(ray_error, NULL, NULL);
  RaySizeError = PyErr_NewException(ray_size_error, NULL, NULL);
  if (RayError == NULL || RaySizeError == NULL) {
    // Py_INCREF on a NULL pointer would crash; give up with the exception set.
    return;
  }
  // One reference stays with the global variable, one goes to the module.
  Py_INCREF(RayError);
  Py_INCREF(RaySizeError);
  PyModule_AddObject(m, "ray_error", RayError);
  PyModule_AddObject(m, "ray_size_error", RaySizeError);
  import_array();
  // Export constants used for the worker mode types so they can be accessed
  // from Python. The Mode enum is defined in worker.h.
  PyModule_AddIntConstant(m, "SCRIPT_MODE", Mode::SCRIPT_MODE);
  PyModule_AddIntConstant(m, "WORKER_MODE", Mode::WORKER_MODE);
  PyModule_AddIntConstant(m, "PYTHON_MODE", Mode::PYTHON_MODE);
  PyModule_AddIntConstant(m, "SILENT_MODE", Mode::SILENT_MODE);
  // Export constants for the failure types so they can be accessed from Python.
  // The FailedType enum is defined in types.proto.
  PyModule_AddIntConstant(m, "FailedTask", FailedType::FailedTask);
  PyModule_AddIntConstant(m, "FailedRemoteFunctionImport", FailedType::FailedRemoteFunctionImport);
  PyModule_AddIntConstant(m, "FailedReusableVariableImport", FailedType::FailedReusableVariableImport);
  PyModule_AddIntConstant(m, "FailedReinitializeReusableVariable", FailedType::FailedReinitializeReusableVariable);
  PyModule_AddIntConstant(m, "FailedFunctionToRun", FailedType::FailedFunctionToRun);
}
}
-1187
View File
File diff suppressed because it is too large Load Diff
-237
View File
@@ -1,237 +0,0 @@
#ifndef RAY_SCHEDULER_H
#define RAY_SCHEDULER_H
#include <deque>
#include <memory>
#include <algorithm>
#include <iostream>
#include <limits>
#include <grpc++/grpc++.h>
#include "ray/ray.h"
#include "ray.grpc.pb.h"
#include "types.pb.h"
#include "utils.h"
#include "computation_graph.h"
using grpc::Server;
using grpc::ServerBuilder;
using grpc::ServerReader;
using grpc::ServerContext;
using grpc::Status;
using grpc::ClientContext;
using grpc::Channel;
// Integer type used for per-object reference counts.
typedef size_t RefCount;
// Sentinel alias target: the largest ObjectID value, used for object IDs whose
// alias target has not been filled in yet.
// NOTE(review): the identifier is spelled "UNITIALIZED" (sic); renaming it
// would affect every user of the constant.
const ObjectID UNITIALIZED_ALIAS = std::numeric_limits<ObjectID>::max();
// Sentinel reference count: the largest RefCount value, used to mark an object
// that has been deallocated.
const RefCount DEALLOCATED = std::numeric_limits<RefCount>::max();
// Per-worker bookkeeping record: the gRPC channel and stub for the worker,
// the object store it is associated with, its address, and its export and
// task state.
struct WorkerHandle {
// gRPC channel to the worker.
std::shared_ptr<Channel> channel;
std::unique_ptr<WorkerService::Stub> worker_stub; // If null, the worker has died
// ID of the object store associated with this worker.
ObjStoreId objstoreid;
// Address of the worker.
std::string worker_address;
// This field is initialized to false, and it is set to true after all of the
// initial exports have been shipped to this worker.
// NOTE(review): no initializer is visible in this struct; the initialization
// presumably happens where the handle is created -- confirm.
bool initial_exports_done;
// NOTE(review): presumably the operation currently assigned to this worker --
// confirm against the code that writes it.
OperationId current_task;
};
// Per-object-store bookkeeping record: the gRPC channel and stub for the
// object store plus its address.
struct ObjStoreHandle {
// gRPC channel to the object store.
std::shared_ptr<Channel> channel;
// Client stub for the ObjStore service.
std::unique_ptr<ObjStore::Stub> objstore_stub;
// Address of the object store.
std::string address;
};
// Selects the scheduling algorithm; a value of this type is passed to the
// SchedulerService constructor.
enum SchedulingAlgorithmType {
// NOTE(review): presumably selects SchedulerService::schedule_tasks_naively -- confirm.
SCHEDULING_ALGORITHM_NAIVE = 0,
// NOTE(review): presumably selects SchedulerService::schedule_tasks_location_aware -- confirm.
SCHEDULING_ALGORITHM_LOCALITY_AWARE = 1
};
// Implementation of the Scheduler gRPC service (derives from
// Scheduler::Service). Declares the RPC handlers, the helper routines they use
// and the Synchronized state those routines operate on.
//
// NOTE: in builds without NDEBUG the lock-order check described at
// lock_orders_ relies on the order of the field declarations below, so fields
// must not be reordered casually.
class SchedulerService : public Scheduler::Service {
public:
// Creates a scheduler that uses the given scheduling algorithm.
SchedulerService(SchedulingAlgorithmType scheduling_algorithm);
// RPC handlers overriding Scheduler::Service.
Status SubmitTask(ServerContext* context, const SubmitTaskRequest* request, SubmitTaskReply* reply) override;
Status PutObj(ServerContext* context, const PutObjRequest* request, PutObjReply* reply) override;
Status RequestObj(ServerContext* context, const RequestObjRequest* request, AckReply* reply) override;
Status AliasObjectIDs(ServerContext* context, const AliasObjectIDsRequest* request, AckReply* reply) override;
Status RegisterObjStore(ServerContext* context, const RegisterObjStoreRequest* request, RegisterObjStoreReply* reply) override;
Status RegisterWorker(ServerContext* context, const RegisterWorkerRequest* request, RegisterWorkerReply* reply) override;
Status RegisterRemoteFunction(ServerContext* context, const RegisterRemoteFunctionRequest* request, AckReply* reply) override;
Status ObjReady(ServerContext* context, const ObjReadyRequest* request, AckReply* reply) override;
Status ReadyForNewTask(ServerContext* context, const ReadyForNewTaskRequest* request, AckReply* reply) override;
Status IncrementRefCount(ServerContext* context, const IncrementRefCountRequest* request, AckReply* reply) override;
Status DecrementRefCount(ServerContext* context, const DecrementRefCountRequest* request, AckReply* reply) override;
Status AddContainedObjectIDs(ServerContext* context, const AddContainedObjectIDsRequest* request, AckReply* reply) override;
Status SchedulerInfo(ServerContext* context, const SchedulerInfoRequest* request, SchedulerInfoReply* reply) override;
Status TaskInfo(ServerContext* context, const TaskInfoRequest* request, TaskInfoReply* reply) override;
Status KillWorkers(ServerContext* context, const KillWorkersRequest* request, KillWorkersReply* reply) override;
Status RunFunctionOnAllWorkers(ServerContext* context, const RunFunctionOnAllWorkersRequest* request, AckReply* reply) override;
Status ExportRemoteFunction(ServerContext* context, const ExportRemoteFunctionRequest* request, AckReply* reply) override;
Status ExportReusableVariable(ServerContext* context, const ExportReusableVariableRequest* request, AckReply* reply) override;
Status NotifyFailure(ServerContext*, const NotifyFailureRequest* request, AckReply* reply) override;
Status Wait(ServerContext*, const WaitRequest* request, WaitReply* reply) override;
#ifdef NDEBUG
// If we've disabled assertions, then just use the regular SynchronizedPtr to skip lock checking.
template<class T>
using MySynchronizedPtr = SynchronizedPtr<T>;
#else
// A SynchronizedPtr specialized for this class to dynamically check that locks are obtained in the correct order (in the order of field declarations).
template<class T>
class MySynchronizedPtr;
#endif
// This will ask an object store to send an object to another object store if
// the object is not already present in that object store and is not already
// being transmitted.
void deliver_object_async_if_necessary(ObjectID objectid, ObjStoreId from, ObjStoreId to);
// Ask an object store to send an object to another object store.
void deliver_object_async(ObjectID objectid, ObjStoreId from, ObjStoreId to);
// Assign a task to a worker.
void schedule();
// Execute a task on a worker and ship the required object IDs.
void assign_task(OperationId operationid, WorkerId workerid, const MySynchronizedPtr<ComputationGraph> &computation_graph);
// Checks if the dependencies of the task are met.
bool can_run(const Task& task);
// Register a new object with the scheduler and return its object ID.
ObjectID register_new_object();
// Register the location of the object ID in the object table.
void add_location(ObjectID objectid, ObjStoreId objstoreid);
// Indicate that objectid is a canonical objectid.
void add_canonical_objectid(ObjectID objectid);
// Get the object store associated with a workerid.
ObjStoreId get_store(WorkerId workerid);
// Register a function with the scheduler.
void register_function(const std::string& name, WorkerId workerid, size_t num_return_vals);
// Get information about the scheduler state.
void get_info(const SchedulerInfoRequest& request, SchedulerInfoReply* reply);
private:
// Pick an objectstore that holds a given object (needs protection by objects_lock_).
// NOTE(review): no member named objects_lock_ is declared in this class;
// presumably this refers to the lock of objtable_ -- confirm.
ObjStoreId pick_objstore(ObjectID objectid);
// Checks if objectid is a canonical objectid.
bool is_canonical(ObjectID objectid);
// Perform all queued up gets that can be performed.
void perform_gets();
// Schedule tasks using the naive algorithm.
void schedule_tasks_naively();
// Schedule tasks using a scheduling algorithm that takes into account data locality.
void schedule_tasks_location_aware();
// NOTE(review): presumably processes the entries of alias_notification_queue_ -- confirm.
void perform_notify_aliases();
// Checks if aliasing for objectid has been completed.
bool has_canonical_objectid(ObjectID objectid);
// Get the canonical objectid for an objectid.
ObjectID get_canonical_objectid(ObjectID objectid);
// Attempt to notify the objstore about potential objectid aliasing. Returns true if successful; if false, retry later.
bool attempt_notify_alias(ObjStoreId objstoreid, ObjectID alias_objectid, ObjectID canonical_objectid);
// Tell all of the objstores holding canonical_objectid to deallocate it. The
// data structures are passed in to ensure that the appropriate locks are held.
void deallocate_object(ObjectID canonical_objectid, const MySynchronizedPtr<std::vector<RefCount> > &reference_counts, const MySynchronizedPtr<std::vector<std::vector<ObjectID> > > &contained_objectids);
// Increment the ref counts for the object IDs in objectids. The data
// structures are passed in to ensure that the appropriate locks are held.
void increment_ref_count(const std::vector<ObjectID> &objectids, const MySynchronizedPtr<std::vector<RefCount> > &reference_count);
// Decrement the ref counts for the object IDs in objectids. The data
// structures are passed in to ensure that the appropriate locks are held.
void decrement_ref_count(const std::vector<ObjectID> &objectids, const MySynchronizedPtr<std::vector<RefCount> > &reference_count, const MySynchronizedPtr<std::vector<std::vector<ObjectID> > > &contained_objectids);
// Find all of the object IDs which are upstream of objectid (including objectid itself). That is, you can get from everything in objectids to objectid by repeatedly indexing in target_objectids_.
void upstream_objectids(ObjectID objectid, std::vector<ObjectID> &objectids, const MySynchronizedPtr<std::vector<std::vector<ObjectID> > > &reverse_target_objectids);
// Find all of the object IDs that refer to the same object as objectid (as best as we can determine at the moment). The information may be incomplete because not all of the aliases may be known.
void get_equivalent_objectids(ObjectID objectid, std::vector<ObjectID> &equivalent_objectids);
// Export a function to run to a worker.
void export_function_to_run_to_worker(WorkerId workerid, int function_index, MySynchronizedPtr<std::vector<WorkerHandle> > &workers, const MySynchronizedPtr<std::vector<std::unique_ptr<Function> > > &exported_functions_to_run);
// Export a remote function to a worker.
void export_remote_function_to_worker(WorkerId workerid, int function_index, MySynchronizedPtr<std::vector<WorkerHandle> > &workers, const MySynchronizedPtr<std::vector<std::unique_ptr<Function> > > &exported_remote_functions);
// Export a reusable variable to a worker.
void export_reusable_variable_to_worker(WorkerId workerid, int reusable_variable_index, MySynchronizedPtr<std::vector<WorkerHandle> > &workers, const MySynchronizedPtr<std::vector<std::unique_ptr<ReusableVar> > > &exported_reusable_variables);
// Export all exports to all workers that need them. This happens the first
// time any export would be exported to a worker or when a worker first calls
// ReadyForNewTask.
void export_everything_to_all_workers_if_necessary(MySynchronizedPtr<std::vector<WorkerHandle> > &workers);
// Accessors that wrap a Synchronized field in a MySynchronizedPtr; name and
// line_number identify the field and the call site.
// NOTE(review): presumably these are what performs the lock-order check in
// builds without NDEBUG -- confirm in the implementation.
template<class T>
MySynchronizedPtr<T> get(Synchronized<T>& my_field, const char* name,unsigned int line_number);
template<class T>
MySynchronizedPtr<const T> get(const Synchronized<T>& my_field, const char* name,unsigned int line_number) const;
// Preferably keep this as the first field to distinguish it from the rest.
// Maps every thread to an identifier of a lock it is holding, as well as the name of the lock.
// Internally, the identifier for each lock is the offset of the field being locked.
// When we lock, we set the field offset and store the difference; the difference should always be positive. If not, we throw.
// When we unlock, we subtract back the field offset to restore it to the previous field that was locked.
mutable Synchronized<std::vector<std::pair<unsigned long long, std::pair<size_t, const char*> > > > lock_orders_;
// List of failed tasks.
Synchronized<std::vector<TaskStatus> > failed_tasks_;
// A list of remote function import failures.
Synchronized<std::vector<Failure> > failed_remote_function_imports_;
// A list of reusable variable import failures.
Synchronized<std::vector<Failure> > failed_reusable_variable_imports_;
// A list of reusable variable reinitialization failures.
Synchronized<std::vector<Failure> > failed_reinitialize_reusable_variables_;
// A list of function-to-run failures.
Synchronized<std::vector<Failure> > failed_function_to_runs_;
// List of pending get calls.
Synchronized<std::vector<std::pair<WorkerId, ObjectID> > > get_queue_;
// The computation graph tracks the operations that have been submitted to the
// scheduler and is mostly used for fault tolerance.
Synchronized<ComputationGraph> computation_graph_;
// Hash map from function names to workers where the function is registered.
Synchronized<FnTable> fntable_;
// Vector of all workers that are currently idle.
Synchronized<std::vector<WorkerId> > avail_workers_;
// List of pending tasks.
Synchronized<std::deque<OperationId> > task_queue_;
// Reference counts. Currently, reference_counts_[objectid] is the number of
// existing references held to objectid. This is done for all objectids, not just
// canonical_objectids. This data structure completely ignores aliasing. If the
// object corresponding to objectid has been deallocated, then
// reference_counts_[objectid] will equal DEALLOCATED.
Synchronized<std::vector<RefCount> > reference_counts_;
// contained_objectids_[objectid] is a vector of all of the objectids contained inside the object referred to by objectid.
Synchronized<std::vector<std::vector<ObjectID> > > contained_objectids_;
// Vector of all workers registered in the system. Their index in this vector
// is the workerid.
Synchronized<std::vector<WorkerHandle> > workers_;
// List of pending alias notifications. Each element consists of (objstoreid, (alias_objectid, canonical_objectid)).
Synchronized<std::vector<std::pair<ObjStoreId, std::pair<ObjectID, ObjectID> > > > alias_notification_queue_;
// Mapping from canonical objectid to list of object stores where the object is stored. Non-canonical (aliased) objectids should not be used to index objtable_.
Synchronized<ObjTable> objtable_; // This lock protects objtable_ and objects_in_transit_
// Vector of all object stores registered in the system. Their index in this
// vector is the objstoreid.
Synchronized<std::vector<ObjStoreHandle> > objstores_;
// Mapping from an aliased objectid to the objectid it is aliased with. If an
// objectid is a canonical objectid (meaning it is not aliased), then
// target_objectids_[objectid] == objectid. For each objectid, target_objectids_[objectid]
// is initialized to UNITIALIZED_ALIAS and the correct value is filled later
// when it is known.
Synchronized<std::vector<ObjectID> > target_objectids_;
// This data structure maps an objectid to all of the objectids that alias it (there could be multiple such objectids).
Synchronized<std::vector<std::vector<ObjectID> > > reverse_target_objectids_;
// For each object store objstoreid, objects_in_transit_[objstoreid] is a
// vector of the canonical object IDs that are being streamed to that
// object store but are not yet present. Object IDs are added to this
// in deliver_object_async_if_necessary (to ensure that we do not attempt to deliver
// the same object to a given object store twice), and object IDs are
// removed when add_location is called (from ObjReady), and they are moved to
// the objtable_. Note that objects_in_transit_ is not wrapped in Synchronized
// itself; it shares the lock of objtable_ (see the comment on objtable_).
// TODO(rkn): Consider making this part of the objtable data structure.
std::vector<std::vector<ObjectID> > objects_in_transit_;
// All of the functions that have been exported to the workers to run.
Synchronized<std::vector<std::unique_ptr<Function> > > exported_functions_to_run_;
// All of the remote functions that have been exported to the workers.
Synchronized<std::vector<std::unique_ptr<Function> > > exported_remote_functions_;
// All of the reusable variables that have been exported to the workers.
Synchronized<std::vector<std::unique_ptr<ReusableVar> > > exported_reusable_variables_;
// The scheduling algorithm that will be used.
SchedulingAlgorithmType scheduling_algorithm_;
};
#endif
-69
View File
@@ -1,69 +0,0 @@
#include "utils.h"
#include "ray/ray.h"
#include <sys/stat.h>
#ifdef _S_IREAD // Visual C++ runtime?
#include <direct.h> // _mkdir
#else
// Fallback used when the Visual C++ runtime's one-argument _mkdir() is not
// available: forwards to mkdir() requesting read/write/execute permission for
// user, group and others, and returns mkdir()'s result unchanged.
namespace {
int _mkdir(char const* path) {
return mkdir(path, S_IRWXU | S_IRWXG | S_IRWXO);
}
}
#endif
// Finds the boundary between host and port in an "address:port" string and
// returns an iterator into `ip_address`.
//   - Bracketed form ("[...]:port"): returns an iterator to the ':' that
//     directly follows the closing ']'.
//   - Any other form: returns an iterator to the character just after the
//     last ':' in the string.
// A missing port is reported through RAY_CHECK / RAY_CHECK_NEQ.
// NOTE(review): the two branches return different positions relative to the
// ':' (on it vs. just past it) -- confirm that callers expect this.
std::string::iterator split_ip_address(std::string& ip_address) {
  if (ip_address[0] != '[') {  // IPv4
    // Searching backwards and converting with base() lands one past the ':';
    // when no ':' exists the result equals begin().
    auto split_point = std::find(ip_address.rbegin(), ip_address.rend(), ':').base();
    RAY_CHECK_NEQ(split_point, ip_address.begin(), "ip address should contain a port number");
    return split_point;
  }
  // IPv6
  auto split_end = std::find(ip_address.begin() + 1, ip_address.end(), ']');
  // Step past ']' (if present) and require a ':' immediately after it.
  const bool has_port = split_end != ip_address.end() &&
                        ++split_end != ip_address.end() &&
                        *split_end == ':';
  if (has_port) {
    return split_end;
  }
  RAY_CHECK(false, "ip address should contain a port number");
}
// Looks for `option` in the argument range [begin, end) and returns the
// argument that immediately follows it. Returns a null pointer when the
// option is absent or is the last argument.
const char* get_cmd_option(char** begin, char** end, const std::string& option) {
  char** const match = std::find(begin, end, option);
  if (match == end) {
    return nullptr;
  }
  char** const value = match + 1;
  return value != end ? *value : nullptr;
}
// Creates the directory `log_file_name`, creating missing ancestors if the
// direct attempt fails. A directory that already exists counts as success.
// The overall outcome is that of the last _mkdir() attempted; failure trips
// RAY_CHECK.
void create_directories(const char* log_file_name) {
  bool success = _mkdir(log_file_name) != -1 || errno == EEXIST;
  if (!success) {
    // The direct attempt failed, so walk the path from its start and try every
    // prefix that ends in a delimiter, then the full path. Intermediate
    // failures are ignored on purpose: we may not have access to the root.
    for (size_t i = 0;; ++i) {
      const char c = log_file_name[i];
      const bool at_end = c == '\0';
      if (at_end || c == '/' || c == '\\') {
        // Keep the delimiter in the prefix; the final prefix is the whole path.
        std::string ancestor(log_file_name, at_end ? i : i + 1);
        success = _mkdir(ancestor.c_str()) != -1 || errno == EEXIST;
      }
      if (at_end) {
        break;
      }
    }
  }
  RAY_CHECK(success, "Failed to create directory for " << log_file_name);
}
// Ensures that the directory which will contain `log_file_name` exists.
// Everything after the last '/' or '\\' is treated as the file name and
// stripped; the remaining prefix (delimiter included) is handed to
// create_directories(), which aborts via RAY_CHECK if it cannot be created.
// A name with no directory component needs no directory to be created, so
// the function returns without doing anything in that case (previously an
// empty path was passed on, which can never be created and was fatal).
void create_log_dir_or_die(const char* log_file_name) {
  std::string dirname = log_file_name;
  const std::string::size_type last_delimiter = dirname.find_last_of("/\\");
  if (last_delimiter == std::string::npos) {
    return;
  }
  dirname.erase(last_delimiter + 1);
  create_directories(dirname.c_str());
}
-97
View File
@@ -1,97 +0,0 @@
#ifndef RAY_UTILS_H
#define RAY_UTILS_H
#include <mutex>
#include <string>
// Synchronized<T, Mutex> pairs a value of type T with a mutex. Both template
// arguments default: T to void and Mutex to std::mutex.
template<class T = void, class Mutex = std::mutex>
class Synchronized;

// Declared without a definition in this header so that Synchronized<const T>
// cannot be instantiated; a const payload doesn't make sense.
template<class T, class Mutex>
class Synchronized<const T, Mutex>;

// Maps a possibly cv-qualified payload type T to the Synchronized type it is
// stored in, moving the qualifiers from the payload onto the wrapper.
// NOTE(review): the `const volatile T` case yields a `const` (not
// `const volatile`) wrapper -- confirm whether dropping volatile is intended.
template<class T, class Mutex>
struct SynchronizedSource {
  using type = Synchronized<T, Mutex>;
};

template<class T, class Mutex>
struct SynchronizedSource<const T, Mutex> {
  using type = const Synchronized<T, Mutex>;
};

template<class T, class Mutex>
struct SynchronizedSource<volatile T, Mutex> {
  using type = volatile Synchronized<T, Mutex>;
};

template<class T, class Mutex>
struct SynchronizedSource<const volatile T, Mutex> {
  using type = const Synchronized<T, Mutex>;
};
template<class T>
class SynchronizedPtr : public std::unique_lock<typename SynchronizedSource<T, void>::type> {
protected:
typedef std::unique_lock<typename SynchronizedSource<T, void>::type> base_type;
// Make these private; they don't make much sense externally...
using base_type::mutex;
public:
typedef T value_type;
SynchronizedPtr(typename base_type::mutex_type& value) : base_type(value) { }
value_type& operator*() const { return *mutex()->unsafe_get(); }
value_type* operator->() const { return mutex() ? mutex()->unsafe_get() : NULL; }
};
// Value-holding half of Synchronized: stores the T and declares the locking
// interface (lock / unlock / try_lock) as pure virtual functions, which the
// full Synchronized<T, Mutex> implements on top of its mutex half.
// NOTE(review): the class has virtual functions but declares no virtual
// destructor -- confirm objects are never deleted through this base type.
template<class T>
class Synchronized<T, void> {
T value_;
public:
typedef T element_type;
// Forwards all constructor arguments to the stored value.
template<class... U>
Synchronized(U&&... args) : value_(std::forward<U>(args)...) { }
// Copy/move construction: the comma expression creates a temporary
// lock_guard on `other` that lives until value_ has been initialized.
// NOTE(review): in the copy constructor `other` is const while
// std::lock_guard<Synchronized> takes a non-const reference -- this looks
// like it would not compile if ever instantiated; confirm.
Synchronized(const Synchronized& other) : value_((std::lock_guard<Synchronized>(other), other.value_)) { }
Synchronized(Synchronized&& other) : value_((std::lock_guard<Synchronized>(other), std::move(other.value_))) { }
// Copy assignment: locks *this, then `other`, then copies the value.
// Self-assignment is a no-op.
// NOTE(review): the fixed this-then-other lock order could deadlock if two
// threads run `a = b` and `b = a` concurrently -- confirm this cannot happen.
// NOTE(review): same const-binding concern as the copy constructor applies
// to guard_other here.
Synchronized& operator =(const Synchronized& other)
{
if (this != &other)
{
std::lock_guard<Synchronized> guard_this(*this);
std::lock_guard<Synchronized> guard_other(other);
value_ = other.value_;
}
return *this;
}
// Move assignment: same locking as copy assignment, but moves the value out
// of `other`.
Synchronized& operator =(Synchronized&& other)
{
if (this != &other)
{
std::lock_guard<Synchronized> guard_this(*this);
std::lock_guard<Synchronized> guard_other(other);
value_ = std::move(other.value_);
}
return *this;
}
// Locking interface; const so that const objects can be locked too.
virtual void lock() const = 0;
virtual void unlock() const = 0;
virtual bool try_lock() const = 0;
// Direct access to the stored value without taking the lock.
element_type* unsafe_get() { return &value_; }
const element_type* unsafe_get() const { return &value_; }
};
// Mutex-holding half of Synchronized: owns a Mutex and forwards lock(),
// unlock() and try_lock() to it. The mutex is mutable so the object can be
// locked through a const reference.
template<class Mutex>
class Synchronized<void, Mutex> {
 public:
  using mutex_type = Mutex;

  void lock() const { mutex_.lock(); }
  void unlock() const { mutex_.unlock(); }
  bool try_lock() const { return mutex_.try_lock(); }

 private:
  mutable Mutex mutex_;
};
// A value of type T bundled with a Mutex. Combines the value-holding base
// (Synchronized<T, void>) with the mutex-holding base
// (Synchronized<void, Mutex>) and implements the former's virtual locking
// interface by delegating to the latter.
template<class T, class Mutex>
class Synchronized : public Synchronized<T, void>, public Synchronized<void, Mutex> {
  using base1_type = Synchronized<T, void>;
  using base2_type = Synchronized<void, Mutex>;

 public:
  // Constructor arguments are forwarded to the stored value; the mutex is
  // default-constructed.
  template<class... U>
  Synchronized(U&&... args) : base1_type(std::forward<U>(args)...), base2_type() { }

  // Returns a SynchronizedPtr built from this object; constructing it locks
  // this object, and the pointer gives access to the value.
  SynchronizedPtr<T> unchecked_get() { return *this; }
  SynchronizedPtr<const T> unchecked_get() const { return *this; }

  void lock() const override { base2_type::lock(); }
  void unlock() const override { base2_type::unlock(); }
  bool try_lock() const override { return base2_type::try_lock(); }
};
// Returns an iterator into `ip_address` marking where the port part of an
// "address:port" string begins; fails a RAY_CHECK if there is no port.
std::string::iterator split_ip_address(std::string& ip_address);
// Returns the argument following `option` in [begin, end), or a null pointer
// if `option` is not present or is the last argument.
const char* get_cmd_option(char** begin, char** end, const std::string& option);
// Creates the directory part of `log_file_name` (everything up to the last
// '/' or '\\'); fails a RAY_CHECK if the directory cannot be created.
void create_log_dir_or_die(const char* log_file_name);
#endif
-497
View File
@@ -1,497 +0,0 @@
#include "worker.h"
#include <atomic>
#include <random>
#include <chrono>
#include <thread>
#include "utils.h"
// Python exception object that the CHECK_ARROW_STATUS macro in this file
// passes to PyErr_SetString(). It is declared static and is not assigned
// anywhere in this file.
// NOTE(review): presumably it is expected to be initialized before use --
// confirm, since a static variable is private to this translation unit.
extern "C" {
static PyObject *RayError;
}
// Records the mode and connects send_queue_ to the message queue named
// `send_queue_name`; a failed connect trips RAY_CHECK.
// The second argument to connect() is `false` here, whereas Worker's
// constructor passes `true` for its receive queue -- presumably the flag
// selects "create" vs. "open existing"; TODO confirm against MessageQueue.
inline WorkerServiceImpl::WorkerServiceImpl(const std::string& send_queue_name, Mode mode)
: mode_(mode) {
RAY_LOG(RAY_INFO, "Worker service connecting to queue " << send_queue_name);
RAY_CHECK(send_queue_.connect(send_queue_name, false), "error connecting send_queue_");
}
// RPC handler, valid on workers only: copies the task from the request into a
// new WorkerMessage and hands it to the worker through send_queue_.
// `context` and `reply` are not used. Always returns Status::OK.
Status WorkerServiceImpl::ExecuteTask(ServerContext* context, const ExecuteTaskRequest* request, AckReply* reply) {
  RAY_CHECK(mode_ == Mode::WORKER_MODE, "ExecuteTask can only be called on workers.");
  RAY_LOG(RAY_INFO, "invoked task " << request->task().name());

  std::unique_ptr<WorkerMessage> message(new WorkerMessage());
  message->mutable_task()->CopyFrom(request->task());

  // The queue carries the pointer value itself, so send its address.
  WorkerMessage* message_ptr = message.get();
  RAY_CHECK(send_queue_.send(&message_ptr), "Failed to send message from the worker service to the worker because the message queue was full.");

  // Give up ownership: the message will get deleted in receive_next_message().
  message.release();
  return Status::OK;
}
// RPC handler, valid on workers only: copies the function from the request
// into a new WorkerMessage (as function_to_run) and hands it to the worker
// through send_queue_. `context` and `reply` are not used. Always returns
// Status::OK.
Status WorkerServiceImpl::RunFunctionOnWorker(ServerContext* context, const RunFunctionOnWorkerRequest* request, AckReply* reply) {
  RAY_CHECK(mode_ == Mode::WORKER_MODE, "RunFunctionOnWorker can only be called on workers.");

  std::unique_ptr<WorkerMessage> message(new WorkerMessage());
  message->mutable_function_to_run()->CopyFrom(request->function());
  RAY_LOG(RAY_INFO, "Running function on worker.");

  // The queue carries the pointer value itself, so send its address.
  WorkerMessage* message_ptr = message.get();
  RAY_CHECK(send_queue_.send(&message_ptr), "Failed to send message from the worker service to the worker because the message queue was full.");

  // Give up ownership: the message will get deleted in receive_next_message().
  message.release();
  return Status::OK;
}
// RPC handler, valid on workers only: copies the remote function from the
// request into a new WorkerMessage and hands it to the worker through
// send_queue_. `context` and `reply` are not used. Always returns Status::OK.
Status WorkerServiceImpl::ImportRemoteFunction(ServerContext* context, const ImportRemoteFunctionRequest* request, AckReply* reply) {
  RAY_CHECK(mode_ == Mode::WORKER_MODE, "ImportRemoteFunction can only be called on workers.");

  std::unique_ptr<WorkerMessage> message(new WorkerMessage());
  message->mutable_function()->CopyFrom(request->function());
  RAY_LOG(RAY_INFO, "importing function");

  // The queue carries the pointer value itself, so send its address.
  WorkerMessage* message_ptr = message.get();
  RAY_CHECK(send_queue_.send(&message_ptr), "Failed to send message from the worker service to the worker because the message queue was full.");

  // Give up ownership: the message will get deleted in receive_next_message().
  message.release();
  return Status::OK;
}
// RPC handler, valid on workers only: copies the reusable variable from the
// request into a new WorkerMessage and hands it to the worker through
// send_queue_. `context` and `reply` are not used. Always returns Status::OK.
Status WorkerServiceImpl::ImportReusableVariable(ServerContext* context, const ImportReusableVariableRequest* request, AckReply* reply) {
  RAY_CHECK(mode_ == Mode::WORKER_MODE, "ImportReusableVariable can only be called on workers.");

  std::unique_ptr<WorkerMessage> message(new WorkerMessage());
  message->mutable_reusable_variable()->CopyFrom(request->reusable_variable());
  RAY_LOG(RAY_INFO, "importing reusable variable");

  // The queue carries the pointer value itself, so send its address.
  WorkerMessage* message_ptr = message.get();
  RAY_CHECK(send_queue_.send(&message_ptr), "Failed to send message from the worker service to the worker because the message queue was full.");

  // Give up ownership: the message will get deleted in receive_next_message().
  message.release();
  return Status::OK;
}
// RPC handler, valid on workers only: enqueues a null message pointer on
// send_queue_. `context`, `request` and `reply` are not used. Always returns
// Status::OK.
Status WorkerServiceImpl::Die(ServerContext* context, const DieRequest* request, AckReply* reply) {
  RAY_CHECK(mode_ == Mode::WORKER_MODE, "Die can only be called on workers.");

  // No message is allocated; the null pointer itself is what gets queued.
  WorkerMessage* message_ptr = nullptr;
  RAY_CHECK(send_queue_.send(&message_ptr), "Failed to send message from the worker service to the worker because the message queue was full.");
  return Status::OK;
}
// RPC handler, valid everywhere except in WORKER_MODE: prints a description of
// the failure carried by `request` to standard output. In SILENT_MODE nothing
// is printed. `context` and `reply` are not used. Always returns Status::OK.
Status WorkerServiceImpl::PrintErrorMessage(ServerContext* context, const PrintErrorMessageRequest* request, AckReply* reply) {
  RAY_CHECK(mode_ != Mode::WORKER_MODE, "PrintErrorMessage can only be called on drivers.");
  if (mode_ == Mode::SILENT_MODE) {
    // Do not log error messages in this case. This is just used for the tests.
    return Status::OK;
  }
  const Failure failure = request->failure();
  const WorkerId workerid = failure.workerid();
  switch (failure.type()) {
    case FailedType::FailedTask:
      // A task threw an exception while executing.
      std::cout << "Error: Worker " << workerid << " failed to execute function " << failure.name() << ". Failed with error message:\n" << failure.error_message() << std::endl;
      break;
    case FailedType::FailedRemoteFunctionImport:
      // An exception was thrown while a remote function was being imported.
      std::cout << "Error: Worker " << workerid << " failed to import remote function " << failure.name() << ", failed with error message:\n" << failure.error_message() << std::endl;
      break;
    case FailedType::FailedReusableVariableImport:
      // An exception was thrown while a reusable variable was being imported.
      std::cout << "Error: Worker " << workerid << " failed to import reusable variable " << failure.name() << ", failed with error message:\n" << failure.error_message() << std::endl;
      break;
    case FailedType::FailedReinitializeReusableVariable:
      // An exception was thrown while a reusable variable was being reinitialized.
      std::cout << "Error: Worker " << workerid << " failed to reinitialize a reusable variable after running remote function " << failure.name() << ", failed with error message:\n" << failure.error_message() << std::endl;
      break;
    case FailedType::FailedFunctionToRun:
      // An exception was thrown while a function was being run on all workers.
      std::cout << "Error: Worker " << workerid << " failed to run function " << failure.name() << " on all workers, failed with error message:\n" << failure.error_message() << std::endl;
      break;
    default:
      RAY_CHECK(false, "This code should be unreachable.");
  }
  return Status::OK;
}
// Creates a worker that talks to the scheduler at `scheduler_address`.
// Sets up the gRPC stub for the scheduler and creates this worker's receive
// queue under a randomly generated name. The worker is not connected until
// register_worker() has completed.
Worker::Worker(const std::string& node_ip_address, const std::string& scheduler_address, Mode mode)
    : scheduler_address_(scheduler_address),
      node_ip_address_(node_ip_address),
      mode_(mode) {
  // These members have no initializer in the class definition. connected_ is
  // read by connected() and by most member functions before register_worker()
  // sets it, so it must start out with a defined value.
  connected_ = false;
  server_ptr_ = nullptr;

  auto scheduler_channel = grpc::CreateChannel(scheduler_address, grpc::InsecureChannelCredentials());
  scheduler_stub_ = Scheduler::NewStub(scheduler_channel);
  // Generate a random string to use for naming the message queue to avoid
  // collisions with message queues created by other workers.
  std::random_device rd;
  std::mt19937 rng(rd());
  std::uniform_int_distribution<int> queue_name_generator(0, 10000000);
  receive_queue_name_ = "worker_receive_queue:" + std::to_string(queue_name_generator(rng));
  RAY_LOG(RAY_INFO, "Worker creating queue " << receive_queue_name_);
  RAY_CHECK(receive_queue_.connect(receive_queue_name_, true), "error connecting receive_queue_");
}
// Submits `request` to the scheduler after stamping it with this worker's id.
// While the reply says the function is not registered, waits
// `retry_wait_milliseconds` and resubmits, for at most 1 + max_retries
// attempts in total. Returns the reply of the last attempt made.
SubmitTaskReply Worker::submit_task(SubmitTaskRequest* request, int max_retries, int retry_wait_milliseconds) {
  RAY_CHECK(connected_, "Attempted to perform submit_task but failed.");
  request->set_workerid(workerid_);
  SubmitTaskReply reply;
  for (int attempt = 0; attempt <= max_retries; ++attempt) {
    // Each attempt uses a fresh ClientContext.
    ClientContext context;
    RAY_CHECK_GRPC(scheduler_stub_->SubmitTask(&context, *request, &reply));
    if (reply.function_registered()) {
      return reply;
    }
    RAY_LOG(RAY_INFO, "The function " << request->task().name() << " was not registered, so attempting to resubmit the task.");
    std::this_thread::sleep_for(std::chrono::milliseconds(retry_wait_milliseconds));
  }
  return reply;
}
// Sends a KillWorkers request to the scheduler using the caller-supplied
// `context` and returns the `success` field of the reply.
// Unlike most other scheduler calls in this file, this one does not check
// connected_ first.
bool Worker::kill_workers(ClientContext &context) {
KillWorkersRequest request;
KillWorkersReply reply;
RAY_CHECK_GRPC(scheduler_stub_->KillWorkers(&context, request, &reply));
return reply.success();
}
void Worker::register_worker(const std::string& node_ip_address, const std::string& objstore_address, bool is_driver) {
if (mode_ == Mode::WORKER_MODE) {
start_worker_service(mode_);
RAY_CHECK(!worker_address_.empty(), "The worker address is empty. This should be initialized by start_worker_service, so it is possible that the thread synchronization failed.")
}
unsigned int retry_wait_milliseconds = 20;
RegisterWorkerRequest request;
request.set_node_ip_address(node_ip_address);
request.set_worker_address(worker_address_);
// The object store address can be the empty string, in which case the
// scheduler will assign an object store address.
request.set_objstore_address(objstore_address);
request.set_is_driver(is_driver);
RegisterWorkerReply reply;
Status status;
// TODO: HACK: retrying is a hack
for (int i = 0; i < 5; ++i) {
ClientContext context;
status = scheduler_stub_->RegisterWorker(&context, request, &reply);
if (status.error_code() != grpc::UNAVAILABLE) {
break;
}
// Note that each pass through the loop may take substantially longer than
// retry_wait_milliseconds because grpc may do its own retrying.
std::this_thread::sleep_for(std::chrono::milliseconds(retry_wait_milliseconds));
}
RAY_CHECK_GRPC(status);
workerid_ = reply.workerid();
objstoreid_ = reply.objstoreid();
objstore_address_ = reply.objstore_address();
segmentpool_ = std::make_shared<MemorySegmentPool>(objstoreid_, objstore_address_, false);
// Connect to the queue for sending requests to the object store.
std::string request_obj_queue_name = std::string("queue:") + objstore_address_ + std::string(":obj");
RAY_LOG(RAY_INFO, "Worker connecting to queue with name " << request_obj_queue_name << " to send requests to the object store.");
RAY_CHECK(request_obj_queue_.connect(request_obj_queue_name, false), "error connecting request_obj_queue_");
// Create a queue for receiving messages from the object store.
std::string receive_obj_queue_name = std::string("queue:") + objstore_address_ + std::string(":worker:") + std::to_string(workerid_) + std::string(":obj");
RAY_LOG(RAY_INFO, "Worker creating queue with name " << receive_obj_queue_name << " to receive messages from the object store.");
RAY_CHECK(receive_obj_queue_.connect(receive_obj_queue_name, true), "error connecting receive_obj_queue_");
connected_ = true;
return;
}
// Sends a RequestObj call for `objectid` to the scheduler on behalf of this
// worker. Requires the worker to be connected.
void Worker::request_object(ObjectID objectid) {
  RAY_CHECK(connected_, "Attempted to perform request_object but failed.");

  RequestObjRequest request;
  request.set_objectid(objectid);
  request.set_workerid(workerid_);

  ClientContext context;
  AckReply reply;
  RAY_CHECK_GRPC(scheduler_stub_->RequestObj(&context, request, &reply));
}
// Asks the scheduler (via PutObj) for a new object id and returns it.
// Requires the worker to be connected.
ObjectID Worker::get_objectid() {
  RAY_CHECK(connected_, "Attempted to perform get_objectid but failed.");

  PutObjRequest request;
  request.set_workerid(workerid_);

  ClientContext context;
  PutObjReply reply;
  RAY_CHECK_GRPC(scheduler_stub_->PutObj(&context, request, &reply));
  return reply.objectid();
}
// Tells the scheduler which object ids are contained inside the object
// `objectid`, after incrementing the reference count of each of them.
// Does nothing beyond the connection check when the list is empty.
void Worker::add_contained_objectids(ObjectID objectid, std::vector<ObjectID> &contained_objectids) {
  RAY_CHECK(connected_, "Attempted to perform add_contained_objectids but failed.");
  if (contained_objectids.empty()) {
    return;
  }

  RAY_LOG(RAY_REFCOUNT, "In add_contained_objectids, calling increment_reference_count for contained objectids");
  // Notify the scheduler that some object references are serialized in the
  // objstore. The corresponding decrement happens when the object
  // corresponding to objectid is deallocated.
  increment_reference_count(contained_objectids);

  // Notify the scheduler about the objectids that we are serializing in the objstore.
  AddContainedObjectIDsRequest contained_objectids_request;
  contained_objectids_request.set_objectid(objectid);
  for (const auto& contained_objectid : contained_objectids) {
    contained_objectids_request.add_contained_objectid(contained_objectid); // TODO(rkn): The naming here is bad
  }

  ClientContext context;
  AckReply reply;
  RAY_CHECK_GRPC(scheduler_stub_->AddContainedObjectIDs(&context, contained_objectids_request, &reply));
}
// Evaluates the arrow::Status expression `s` once. If the status is not ok,
// sets the Python exception RayError to `msg` followed by the status text and
// makes the enclosing function return NULL, so it can only be used inside
// functions whose return type accepts NULL.
// NOTE(review): the expansion already ends with ';' after `while (0)`, so an
// invocation written with its own trailing ';' produces an extra empty
// statement, which breaks inside an unbraced if/else. Left unchanged here
// because call sites may depend on the current form.
#define CHECK_ARROW_STATUS(s, msg) \
do { \
arrow::Status _s = (s); \
if (!_s.ok()) { \
std::string _errmsg = std::string(msg) + _s.ToString(); \
PyErr_SetString(RayError, _errmsg.c_str()); \
return NULL; \
} \
} while (0);
// Asks the object store to allocate `size` bytes for `objectid` and returns
// the address of the buffer as resolved by the segment pool. The id of the
// segment holding the buffer is written to `segmentid`.
// Requires the worker to be connected.
// NOTE(review): only workerid, type, objectid and size of the request are
// set here -- confirm ObjRequest gives its remaining fields defined values.
const char* Worker::allocate_buffer(ObjectID objectid, int64_t size, SegmentId& segmentid) {
  RAY_CHECK(connected_, "Attempted to perform put_arrow but failed.");

  ObjRequest request;
  request.type = ObjRequestType::ALLOC;
  request.workerid = workerid_;
  request.objectid = objectid;
  request.size = size;
  RAY_CHECK(request_obj_queue_.send(&request), "Failed to send request from the worker to the object store because the message queue was full.");

  // The object store answers with a handle describing the new buffer.
  ObjHandle result;
  RAY_CHECK(receive_obj_queue_.receive(&result), "error receiving over IPC");

  const char* const address = reinterpret_cast<const char*>(segmentpool_->get_address(result));
  segmentid = result.segmentid();
  return address;
}
// Unmaps segment `segmentid` from the segment pool and sends a WORKER_DONE
// request for `objectid`, carrying `metadata_offset`, to the object store.
// Returns Python's None. This function does not check connected_.
// NOTE(review): the request's size field is not set here -- confirm
// ObjRequest gives it a defined value.
PyObject* Worker::finish_buffer(ObjectID objectid, SegmentId segmentid, int64_t metadata_offset) {
  segmentpool_->unmap_segment(segmentid);

  ObjRequest request;
  request.type = ObjRequestType::WORKER_DONE;
  request.workerid = workerid_;
  request.objectid = objectid;
  request.metadata_offset = metadata_offset;
  RAY_CHECK(request_obj_queue_.send(&request), "Failed to send request from the worker to the object store because the message queue was full.");

  Py_RETURN_NONE;
}
// Sends a GET request for `objectid` to the object store and returns the
// address of the object's buffer as resolved by the segment pool. The
// buffer's size, segment id and metadata offset from the returned handle are
// written to the corresponding output parameters.
// Requires the worker to be connected.
const char* Worker::get_buffer(ObjectID objectid, int64_t &size, SegmentId& segmentid, int64_t& metadata_offset) {
  RAY_CHECK(connected_, "Attempted to perform get_arrow but failed.");

  ObjRequest request;
  request.type = ObjRequestType::GET;
  request.workerid = workerid_;
  request.objectid = objectid;
  RAY_CHECK(request_obj_queue_.send(&request), "Failed to send request from the worker to the object store because the message queue was full.");

  // The object store answers with a handle describing the object.
  ObjHandle result;
  RAY_CHECK(receive_obj_queue_.receive(&result), "error receiving over IPC");

  const char* const address = reinterpret_cast<const char*>(segmentpool_->get_address(result));
  size = result.size();
  segmentid = result.segmentid();
  metadata_offset = result.metadata_offset();
  return address;
}
// Sends a GET request for `objectid` to the object store and reports whether
// the returned handle has a non-zero metadata offset.
// Requires the worker to be connected.
bool Worker::is_arrow(ObjectID objectid) {
  RAY_CHECK(connected_, "Attempted to perform is_arrow but failed.");

  ObjRequest request;
  request.type = ObjRequestType::GET;
  request.workerid = workerid_;
  request.objectid = objectid;
  RAY_CHECK(request_obj_queue_.send(&request), "Failed to send request from the worker to the object store because the message queue was full.");

  ObjHandle result;
  RAY_CHECK(receive_obj_queue_.receive(&result), "error receiving over IPC");

  // Only the handle is inspected; the object's address is never resolved.
  return result.metadata_offset() != 0;
}
// Asks the segment pool to unmap a segment. When the worker is not connected
// this only logs a debug message and returns.
// NOTE(review): `objectid` is passed to unmap_segment(), which finish_buffer()
// calls with a SegmentId -- confirm whether callers really pass an object id
// here or a segment id.
void Worker::unmap_object(ObjectID objectid) {
if (!connected_) {
RAY_LOG(RAY_DEBUG, "Attempted to perform unmap_object but failed.");
return;
}
segmentpool_->unmap_segment(objectid);
}
// Sends an AliasObjectIDs call to the scheduler with `alias_objectid` as the
// alias and `target_objectid` as the target. Requires the worker to be
// connected.
void Worker::alias_objectids(ObjectID alias_objectid, ObjectID target_objectid) {
  RAY_CHECK(connected_, "Attempted to perform alias_objectids but failed.");

  AliasObjectIDsRequest request;
  request.set_target_objectid(target_objectid);
  request.set_alias_objectid(alias_objectid);

  ClientContext context;
  AckReply reply;
  RAY_CHECK_GRPC(scheduler_stub_->AliasObjectIDs(&context, request, &reply));
}
void Worker::increment_reference_count(std::vector<ObjectID> &objectids) {
if (!connected_) {
RAY_LOG(RAY_DEBUG, "Attempting to increment_reference_count for objectids, but connected_ = " << connected_ << " so returning instead.");
return;
}
if (objectids.size() > 0) {
ClientContext context;
IncrementRefCountRequest request;
for (int i = 0; i < objectids.size(); ++i) {
RAY_LOG(RAY_REFCOUNT, "Incrementing reference count for objectid " << objectids[i]);
request.add_objectid(objectids[i]);
}
AckReply reply;
RAY_CHECK_GRPC(scheduler_stub_->IncrementRefCount(&context, request, &reply));
}
}
void Worker::decrement_reference_count(std::vector<ObjectID> &objectids) {
if (!connected_) {
RAY_LOG(RAY_DEBUG, "Attempting to decrement_reference_count, but connected_ = " << connected_ << " so returning instead.");
return;
}
if (objectids.size() > 0) {
ClientContext context;
DecrementRefCountRequest request;
for (int i = 0; i < objectids.size(); ++i) {
RAY_LOG(RAY_REFCOUNT, "Decrementing reference count for objectid " << objectids[i]);
request.add_objectid(objectids[i]);
}
AckReply reply;
RAY_CHECK_GRPC(scheduler_stub_->DecrementRefCount(&context, request, &reply));
}
}
// Sends a RegisterRemoteFunction call to the scheduler for the function
// `name` with `num_return_vals` return values, on behalf of this worker.
// Requires the worker to be connected.
void Worker::register_remote_function(const std::string& name, size_t num_return_vals) {
  RAY_CHECK(connected_, "Attempted to perform register_function but failed.");

  RegisterRemoteFunctionRequest request;
  request.set_function_name(name);
  request.set_num_return_vals(num_return_vals);
  request.set_workerid(workerid_);

  ClientContext context;
  AckReply reply;
  RAY_CHECK_GRPC(scheduler_stub_->RegisterRemoteFunction(&context, request, &reply));
}
// Reports a failure to the scheduler through NotifyFailure. The report
// carries the failure type, this worker's id and address, `name` and
// `error_message`. Requires the worker to be connected.
void Worker::notify_failure(FailedType type, const std::string& name, const std::string& error_message) {
  RAY_CHECK(connected_, "Attempted to perform notify_failure but failed.");

  NotifyFailureRequest request;
  // All fields live on the request's nested failure message.
  auto* failure = request.mutable_failure();
  failure->set_type(type);
  failure->set_workerid(workerid_);
  failure->set_worker_address(worker_address_);
  failure->set_name(name);
  failure->set_error_message(error_message);

  ClientContext context;
  AckReply reply;
  RAY_CHECK_GRPC(scheduler_stub_->NotifyFailure(&context, request, &reply));
}
// Receives one WorkerMessage pointer from receive_queue_ and returns it
// wrapped in a unique_ptr, which takes over ownership of the message that the
// worker service allocated and released. The result may be null:
// WorkerServiceImpl::Die() enqueues a null pointer.
std::unique_ptr<WorkerMessage> Worker::receive_next_message() {
WorkerMessage* message_ptr;
RAY_CHECK(receive_queue_.receive(&message_ptr), "error receiving over IPC");
return std::unique_ptr<WorkerMessage>(message_ptr);
}
// Sends a ReadyForNewTask call carrying this worker's id to the scheduler.
// Requires the worker to be connected.
void Worker::ready_for_new_task() {
  RAY_CHECK(connected_, "Attempted to perform ready_for_new_task but failed.");

  ReadyForNewTaskRequest request;
  request.set_workerid(workerid_);

  ClientContext context;
  AckReply reply;
  RAY_CHECK_GRPC(scheduler_stub_->ReadyForNewTask(&context, request, &reply));
}
// Marks the worker as disconnected. The shutdown of the worker service and
// the join of worker_server_thread_ are commented out, so this function does
// nothing to stop the service thread started by start_worker_service().
// NOTE(review): a std::thread that is still joinable when destroyed
// terminates the program -- confirm how worker_server_thread_ is cleaned up.
void Worker::disconnect() {
connected_ = false;
// Shut down the worker service. This will cause the call to server->Wait() to
// return.
// server_ptr_->Shutdown();
// Wait for the thread that launched the worker service to return.
// worker_server_thread_.join();
}
// Forwards `request` to the scheduler's SchedulerInfo call using the
// caller-supplied `context`; the scheduler's answer is written into `reply`.
// Requires the worker to be connected.
// TODO(rkn): Should we be using pointers or references? And should they be const?
void Worker::scheduler_info(ClientContext &context, SchedulerInfoRequest &request, SchedulerInfoReply &reply) {
RAY_CHECK(connected_, "Attempted to get scheduler info but failed.");
RAY_CHECK_GRPC(scheduler_stub_->SchedulerInfo(&context, request, &reply));
}
// Forwards `request` to the scheduler's TaskInfo call using the
// caller-supplied `context`; the scheduler's answer is written into `reply`.
// Requires the worker to be connected. Note that the check's message says
// "worker info" although the call made is TaskInfo.
void Worker::task_info(ClientContext &context, TaskInfoRequest &request, TaskInfoReply &reply) {
RAY_CHECK(connected_, "Attempted to get worker info but failed.");
RAY_CHECK_GRPC(scheduler_stub_->TaskInfo(&context, request, &reply));
}
std::vector<int> Worker::wait(std::vector<ObjectID>& objectids) {
RAY_CHECK(connected_, "Attempted to test if object was ready but failed.");
ClientContext context;
WaitRequest request;
WaitReply reply;
for (int i = 0; i < objectids.size(); ++i) {
request.add_objectids(objectids[i]);
}
RAY_CHECK_GRPC(scheduler_stub_->Wait(&context, request, &reply));
std::vector<int> result;
for (int i = 0; i < reply.indices_size(); ++i) {
result.push_back(reply.indices(i));
}
return result;
}
// Sends `function` as the implementation of a RunFunctionOnAllWorkers request
// to the scheduler. Requires the worker to be connected.
void Worker::run_function_on_all_workers(const std::string& function) {
  RAY_CHECK(connected_, "Attempted to run function on all workers but failed.");

  RunFunctionOnAllWorkersRequest request;
  request.mutable_function()->set_implementation(function);

  ClientContext context;
  AckReply reply;
  RAY_CHECK_GRPC(scheduler_stub_->RunFunctionOnAllWorkers(&context, request, &reply));
}
// Sends an ExportRemoteFunction request to the scheduler with the given name
// and implementation. Requires the worker to be connected.
// The return value is always true.
bool Worker::export_remote_function(const std::string& function_name, const std::string& function) {
  RAY_CHECK(connected_, "Attempted to export function but failed.");

  ExportRemoteFunctionRequest request;
  auto* exported_function = request.mutable_function();
  exported_function->set_name(function_name);
  exported_function->set_implementation(function);

  ClientContext context;
  AckReply reply;
  RAY_CHECK_GRPC(scheduler_stub_->ExportRemoteFunction(&context, request, &reply));
  return true;
}
// Sends an ExportReusableVariable request to the scheduler describing the
// variable `name` together with the implementations of its initializer and
// reinitializer. Requires the worker to be connected.
void Worker::export_reusable_variable(const std::string& name, const std::string& initializer, const std::string& reinitializer) {
  RAY_CHECK(connected_, "Attempted to export reusable variable but failed.");

  ExportReusableVariableRequest request;
  auto* variable = request.mutable_reusable_variable();
  variable->set_name(name);
  variable->mutable_initializer()->set_implementation(initializer);
  variable->mutable_reinitializer()->set_implementation(reinitializer);

  ClientContext context;
  AckReply reply;
  RAY_CHECK_GRPC(scheduler_stub_->ExportReusableVariable(&context, request, &reply));
}
// Starts the worker service on a new thread and returns once it is listening.
// On return worker_address_ holds "<node_ip_address_>:<port>" and server_ptr_
// points at the running server.
//
// Communication between the WorkerServer and the Worker happens via a message
// queue. This is because the Python interpreter needs to be single threaded
// (in our case running in the main thread), whereas the WorkerService will
// run in a separate thread and potentially utilize multiple threads.
void Worker::start_worker_service(Mode mode) {
// Use atomics so the worker service thread can signal the outside thread that
// the worker service has been started.
std::atomic_bool worker_service_started;
worker_service_started.store(false);
// Launch a new thread for running the worker service. We store this as a
// field so that we can clean it up when we disconnect the worker.
// The lambda captures the local flag by reference. That works here because
// this function does not return until the flag has been set, and the lambda
// does not touch the flag after setting it.
worker_server_thread_ = std::thread([this, mode, &worker_service_started]() {
// Create the worker service.
WorkerServiceImpl service(receive_queue_name_, mode);
ServerBuilder builder;
// Let GRPC choose an unused port.
int port;
builder.AddListeningPort(std::string("0.0.0.0:0"), grpc::InsecureServerCredentials(), &port);
builder.RegisterService(&service);
std::unique_ptr<Server> server(builder.BuildAndStart());
if (server == nullptr) {
RAY_CHECK(false, "Failed to create the worker service.");
}
// Publish the address and server pointer before raising the flag, so the
// waiting thread sees them once it observes the flag.
worker_address_ = node_ip_address_ + ":" + std::to_string(port);
// The server is owned by this thread's unique_ptr; server_ptr_ is a
// non-owning pointer to it.
server_ptr_ = server.get();
RAY_LOG(RAY_INFO, "worker server listening at " << worker_address_);
worker_service_started.store(true);
// Wait for work and process work. This method does not return until
// Shutdown is called from a different thread.
server->Wait();
RAY_LOG(RAY_INFO, "Worker service thread returning.")
});
// Wait for the worker service to start by polling the atomic flag every
// 100 ms. The polling stands in for a condition variable; presumably it was
// the condition-variable approach that failed on Mac OS X on Travis -- TODO
// confirm.
while (!worker_service_started.load()) {
RAY_LOG(RAY_INFO, "Looping while waiting for the worker service to start.");
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
}
-145
View File
@@ -1,145 +0,0 @@
#ifndef RAY_WORKER_H
#define RAY_WORKER_H
#include <iostream>
#include <memory>
#include <string>
#include <thread>
#include <grpc++/grpc++.h>
#include <Python.h>
using grpc::Server;
using grpc::ServerBuilder;
using grpc::ServerContext;
using grpc::Status;
#include "ray.grpc.pb.h"
#include "ray/ray.h"
#include "ipc.h"
using grpc::Channel;
using grpc::ClientContext;
using grpc::ClientWriter;
// These four constants define the mode that a worker is running in. Right
// now, this is mostly used for determining how to print information about
// task failures: WORKER_MODE is required by the worker service's task and
// import handlers and rejected by PrintErrorMessage, and SILENT_MODE
// suppresses the printing of error messages (used for the tests).
enum Mode {SCRIPT_MODE, WORKER_MODE, PYTHON_MODE, SILENT_MODE};
// gRPC service that runs inside a worker or driver process. In WORKER_MODE
// its handlers turn incoming requests into WorkerMessages and pass them to
// the worker over send_queue_; outside WORKER_MODE only PrintErrorMessage is
// accepted.
class WorkerServiceImpl final : public WorkerService::Service {
public:
// NOTE(review): the definition names the first parameter `send_queue_name`
// and uses it as the name of the queue to connect to, not as an address.
WorkerServiceImpl(const std::string& worker_address, Mode mode);
// Handlers that enqueue work for the worker (WORKER_MODE only).
Status ExecuteTask(ServerContext* context, const ExecuteTaskRequest* request, AckReply* reply) override;
Status RunFunctionOnWorker(ServerContext* context, const RunFunctionOnWorkerRequest* request, AckReply* reply) override;
Status ImportRemoteFunction(ServerContext* context, const ImportRemoteFunctionRequest* request, AckReply* reply) override;
// Enqueues a null message for the worker (WORKER_MODE only).
Status Die(ServerContext* context, const DieRequest* request, AckReply* reply) override;
Status ImportReusableVariable(ServerContext* context, const ImportReusableVariableRequest* request, AckReply* reply) override;
// Prints a failure report to standard output (any mode except WORKER_MODE).
Status PrintErrorMessage(ServerContext* context, const PrintErrorMessageRequest* request, AckReply* reply) override;
private:
// The queue used to send commands from the worker service to the worker. This
// corresponds to the receive_queue_ in the worker.
MessageQueue<WorkerMessage*> send_queue_;
// The mode of the process this service is part of. The handlers use it to
// tell worker processes (WORKER_MODE) from other processes and to silence
// error printing in SILENT_MODE.
Mode mode_;
};
class Worker {
public:
Worker(const std::string& node_ip_address, const std::string& scheduler_address, Mode mode);
// Submit a remote task to the scheduler. If the function in the task is not
// registered with the scheduler, we will sleep for retry_wait_milliseconds
// and try to resubmit the task to the scheduler up to max_retries more times.
SubmitTaskReply submit_task(SubmitTaskRequest* request, int max_retries = 10, int retry_wait_milliseconds = 500);
// Requests the scheduler to kill workers
bool kill_workers(ClientContext &context);
// send request to the scheduler to register this worker
void register_worker(const std::string& ip_address, const std::string& objstore_address, bool is_driver);
// get a new object ID that is registered with the scheduler
ObjectID get_objectid();
// request an object to be delivered to the local object store
void request_object(ObjectID objectid);
// Notify the scheduler about the object IDs contained within a remote object.
void add_contained_objectids(ObjectID objectid, std::vector<ObjectID> &contained_objectids);
// Allocates buffer for objectid with size of size
const char* allocate_buffer(ObjectID objectid, int64_t size, SegmentId& segmentid);
// Finishes buffer with segmentid and an offset of metadata_ofset
PyObject* finish_buffer(ObjectID objectid, SegmentId segmentid, int64_t metadata_offset);
// Gets the buffer for objectid
const char* get_buffer(ObjectID objectid, int64_t& size, SegmentId& segmentid, int64_t& metadata_offset);
// determine if the object stored in objectid is an arrow object // TODO(pcm): more general mechanism for this?
bool is_arrow(ObjectID objectid);
// unmap the segment containing an object from the local address space
void unmap_object(ObjectID objectid);
// make `alias_objectid` refer to the same object that `target_objectid` refers to
void alias_objectids(ObjectID alias_objectid, ObjectID target_objectid);
// increment the reference count for objectid
void increment_reference_count(std::vector<ObjectID> &objectid);
// Decrement the reference count for the given object IDs. Note that the
// parameter objectid is a vector of IDs despite its singular name.
void decrement_reference_count(std::vector<ObjectID> &objectid);
// Notify the scheduler that the remote function `name` has been imported
// successfully.
// NOTE(review): num_return_vals is presumably the number of values the
// function returns -- confirm.
void register_remote_function(const std::string& name, size_t num_return_vals);
// Notify the scheduler that a failure has occurred, passing the failure type,
// a name and an error message.
// NOTE(review): what `name` identifies (e.g. the failing function) is not
// visible here -- confirm against callers.
void notify_failure(FailedType type, const std::string& name, const std::string& error_message);
// Start the worker server, which accepts commands from the scheduler.
//  - For workers, these commands are stored in the message queue, which is
//    read by the Python interpreter.
//  - For drivers, these commands are only used for printing error messages.
void start_worker_service(Mode mode);
// Wait for the next task from the RPC system, delivered as a WorkerMessage.
// A null result means there are no more tasks and the worker should shut
// down.
std::unique_ptr<WorkerMessage> receive_next_message();
// Tell the scheduler that this worker is ready for a new task.
void ready_for_new_task();
// Disconnect the worker.
// NOTE(review): presumably this also clears the flag reported by connected()
// -- confirm in the implementation.
void disconnect();
// return connected_
bool connected() { return connected_; }
// Get info about the scheduler state.
// NOTE(review): reply is a non-const reference, so it is presumably filled in
// with the scheduler's answer to request -- confirm.
void scheduler_info(ClientContext &context, SchedulerInfoRequest &request, SchedulerInfoReply &reply);
// Get task statuses from the scheduler.
// NOTE(review): reply is a non-const reference, so it is presumably filled in
// with the scheduler's answer to request -- confirm.
void task_info(ClientContext &context, TaskInfoRequest &request, TaskInfoReply &reply);
// Get the indices of the available objects.
// NOTE(review): the indices presumably refer to positions in objectids --
// confirm in the implementation.
std::vector<int> wait(std::vector<ObjectID>& objectids);
// Export a function to be run on all workers.
// NOTE(review): `function` is presumably the serialized function -- confirm
// the expected encoding against callers.
void run_function_on_all_workers(const std::string& function);
// Export the function `function_name` to the workers.
// NOTE(review): the meaning of the bool result is not visible here --
// presumably it reports whether the export succeeded; confirm.
bool export_remote_function(const std::string& function_name, const std::string& function);
// Export the reusable variable `name` to the workers, together with its
// initializer and reinitializer.
// NOTE(review): initializer and reinitializer are presumably serialized
// functions -- confirm against callers.
void export_reusable_variable(const std::string& name, const std::string& initializer, const std::string& reinitializer);
// return the worker address
const char* get_worker_address() { return worker_address_.c_str(); }
private:
// NOTE(review): presumably the mode passed to the constructor -- confirm.
Mode mode_;
// Flag returned by connected().
bool connected_;
// Chunk size of 8 * 1024 (8192). This is a non-static const member, so every
// Worker instance carries its own copy.
// NOTE(review): units and use are not visible here (presumably bytes) --
// confirm.
const size_t CHUNK_SIZE = 8 * 1024;
// Stub for the Scheduler service.
// NOTE(review): presumably the client used for all calls to the scheduler --
// confirm.
std::unique_ptr<Scheduler::Stub> scheduler_stub_;
// Raw pointer to a Server.
// NOTE(review): ownership of the pointee is not visible here -- confirm.
Server* server_ptr_;
// NOTE(review): by its name, presumably the thread running the worker server
// started by start_worker_service -- confirm.
std::thread worker_server_thread_;
// Managed shared memory segment.
// NOTE(review): bip is presumably an alias for boost::interprocess -- confirm.
bip::managed_shared_memory segment_;
// NOTE(review): presumably the IDs of this worker and of its object store, as
// known to the scheduler -- confirm where they are assigned.
WorkerId workerid_;
ObjStoreId objstoreid_;
// Address and IP strings. worker_address_ is the string exposed by
// get_worker_address().
// NOTE(review): the others presumably hold the scheduler address, the object
// store address and the IP address of this node, as their names suggest --
// confirm.
std::string scheduler_address_;
std::string objstore_address_;
std::string worker_address_;
std::string node_ip_address_;
// The queue used to send commands from the worker service to the worker.
// This queue is created by the worker. It corresponds to the send_queue_ in
// the worker service.
MessageQueue<WorkerMessage*> receive_queue_;
// The name of the receive queue.
std::string receive_queue_name_;
// The queue used to send requests to the object store. There is a single
// queue shared by all workers sending requests to the object store, and that
// queue is created by the object store.
MessageQueue<ObjRequest> request_obj_queue_;
// The queue used to receive object addresses from the object store. This
// queue is created by this worker.
MessageQueue<ObjHandle> receive_obj_queue_;
// Shared pointer to a MemorySegmentPool.
// NOTE(review): its role is not visible here; presumably it manages the
// memory segments referred to by SegmentId -- confirm.
std::shared_ptr<MemorySegmentPool> segmentpool_;
};
#endif
-1
Submodule thirdparty/grpc deleted from 2a69139aa7
Submodule thirdparty/numbuf deleted from 7055c6f793
Submodule thirdparty/python deleted from 3f8fa00528