diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..bda3778bb --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,40 @@ +cmake_minimum_required(VERSION 2.8) + +project(numbuf) + +list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) + +find_package(PythonInterp REQUIRED) +find_package(PythonLibs REQUIRED) +find_package(NumPy REQUIRED) + +include_directories("${PYTHON_INCLUDE_DIRS}") +include_directories("${NUMPY_INCLUDE_DIR}") + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + +if (UNIX AND NOT APPLE) + link_libraries(rt) +endif() + +set(ARROW_DIR "${CMAKE_SOURCE_DIR}/../arrow/" CACHE STRING + "Path of the arrow source directory") + +set(ARROW_STATIC_LIB "${CMAKE_SOURCE_DIR}/../arrow/cpp/build/debug/libarrow.so" CACHE STRING + "Path to libarrow.a (needs to be changed if arrow is build in debug mode)") + +include_directories("${ARROW_DIR}/cpp/src/") +include_directories("cpp/src/") +include_directories("python/src/") + +add_definitions(-fPIC) + +add_library(numbuf SHARED + cpp/src/numbuf/tensor.cc + cpp/src/numbuf/dict.cc + cpp/src/numbuf/sequence.cc + python/src/pynumbuf/numbuf.cc + python/src/pynumbuf/adapters/numpy.cc + python/src/pynumbuf/adapters/python.cc) + +target_link_libraries(numbuf ${ARROW_STATIC_LIB} ${PYTHON_LIBRARIES}) diff --git a/cmake/Modules/FindNumPy.cmake b/cmake/Modules/FindNumPy.cmake new file mode 100644 index 000000000..6e1f3c415 --- /dev/null +++ b/cmake/Modules/FindNumPy.cmake @@ -0,0 +1,54 @@ +# - Find the NumPy libraries +# This module finds if NumPy is installed, and sets the following variables +# indicating where it is. +# +# +# NUMPY_FOUND - was NumPy found +# NUMPY_VERSION - the version of NumPy found as a string +# NUMPY_VERSION_MAJOR - the major version number of NumPy +# NUMPY_VERSION_MINOR - the minor version number of NumPy +# NUMPY_VERSION_PATCH - the patch version number of NumPy +# NUMPY_VERSION_DECIMAL - e.g. version 1.6.1 is 10601 +# NUMPY_INCLUDE_DIR - path to the NumPy include files + +unset(NUMPY_VERSION) +unset(NUMPY_INCLUDE_DIR) + +if(PYTHONINTERP_FOUND) + execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" + "import numpy as n; print(n.__version__); print(n.get_include());" + RESULT_VARIABLE __result + OUTPUT_VARIABLE __output + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(__result MATCHES 0) + string(REGEX REPLACE ";" "\\\\;" __values ${__output}) + string(REGEX REPLACE "\r?\n" ";" __values ${__values}) + list(GET __values 0 NUMPY_VERSION) + list(GET __values 1 NUMPY_INCLUDE_DIR) + + string(REGEX MATCH "^([0-9])+\\.([0-9])+\\.([0-9])+" __ver_check "${NUMPY_VERSION}") + if(NOT "${__ver_check}" STREQUAL "") + set(NUMPY_VERSION_MAJOR ${CMAKE_MATCH_1}) + set(NUMPY_VERSION_MINOR ${CMAKE_MATCH_2}) + set(NUMPY_VERSION_PATCH ${CMAKE_MATCH_3}) + math(EXPR NUMPY_VERSION_DECIMAL + "(${NUMPY_VERSION_MAJOR} * 10000) + (${NUMPY_VERSION_MINOR} * 100) + ${NUMPY_VERSION_PATCH}") + string(REGEX REPLACE "\\\\" "/" NUMPY_INCLUDE_DIR ${NUMPY_INCLUDE_DIR}) + else() + unset(NUMPY_VERSION) + unset(NUMPY_INCLUDE_DIR) + message(STATUS "Requested NumPy version and include path, but got instead:\n${__output}\n") + endif() + endif() +else() + message(STATUS "To find NumPy Python interpretator is required to be found.") +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(NumPy REQUIRED_VARS NUMPY_INCLUDE_DIR NUMPY_VERSION + VERSION_VAR NUMPY_VERSION) + +if(NUMPY_FOUND) + message(STATUS "NumPy ver. ${NUMPY_VERSION} found (include: ${NUMPY_INCLUDE_DIR})") +endif() diff --git a/cpp/src/numbuf/dict.cc b/cpp/src/numbuf/dict.cc new file mode 100644 index 000000000..ff5f7239f --- /dev/null +++ b/cpp/src/numbuf/dict.cc @@ -0,0 +1,23 @@ +#include "dict.h" + +using namespace arrow; + +namespace numbuf { + +std::shared_ptr DictBuilder::Finish( + std::shared_ptr list_data, + std::shared_ptr tuple_data, + std::shared_ptr dict_data) { + // lists and dicts can't be keys of dicts in Python, that is why for + // the keys we do not need to collect sublists + auto keys = keys_.Finish(nullptr, nullptr, nullptr); + auto vals = vals_.Finish(list_data, tuple_data, dict_data); + auto keys_field = std::make_shared("keys", keys->type()); + auto vals_field = std::make_shared("vals", vals->type()); + auto type = std::make_shared(std::vector({keys_field, vals_field})); + std::vector field_arrays({keys, vals}); + DCHECK(keys->length() == vals->length()); + return std::make_shared(type, keys->length(), field_arrays); +} + +} diff --git a/cpp/src/numbuf/dict.h b/cpp/src/numbuf/dict.h new file mode 100644 index 000000000..5c2abeacd --- /dev/null +++ b/cpp/src/numbuf/dict.h @@ -0,0 +1,48 @@ +#ifndef NUMBUF_DICT_H +#define NUMBUF_DICT_H + +#include + +#include "sequence.h" + +namespace numbuf { + +/*! Constructing dictionaries of key/value pairs. Sequences of + keys and values are built separately using a pair of + SequenceBuilders. The resulting Arrow representation + can be obtained via the Finish method. +*/ +class DictBuilder { +public: + DictBuilder(arrow::MemoryPool* pool = nullptr) + : keys_(pool), vals_(pool) {} + + //! Builder for the keys of the dictionary + SequenceBuilder& keys() { return keys_; } + //! Builder for the values of the dictionary + SequenceBuilder& vals() { return vals_; } + + /*! Construct an Arrow StructArray representing the dictionary. + Contains a field "keys" for the keys and "vals" for the values. + + \param list_data + List containing the data from nested lists in the value + list of the dictionary + + \param dict_data + List containing the data from nested dictionaries in the + value list of the dictionary + */ + std::shared_ptr Finish( + std::shared_ptr list_data, + std::shared_ptr tuple_data, + std::shared_ptr dict_data); + +private: + SequenceBuilder keys_; + SequenceBuilder vals_; +}; + +} + +#endif diff --git a/cpp/src/numbuf/sequence.cc b/cpp/src/numbuf/sequence.cc new file mode 100644 index 000000000..ae250180f --- /dev/null +++ b/cpp/src/numbuf/sequence.cc @@ -0,0 +1,167 @@ +#include "sequence.h" + +using namespace arrow; + +namespace numbuf { + +SequenceBuilder::SequenceBuilder(MemoryPool* pool) + : pool_(pool), types_(pool), offsets_(pool), + nones_(pool, std::make_shared()), + bools_(pool, std::make_shared()), + ints_(pool), strings_(pool, std::make_shared()), + floats_(pool), doubles_(pool), + uint8_tensors_(std::make_shared(), pool), + int8_tensors_(std::make_shared(), pool), + uint16_tensors_(std::make_shared(), pool), + int16_tensors_(std::make_shared(), pool), + uint32_tensors_(std::make_shared(), pool), + int32_tensors_(std::make_shared(), pool), + uint64_tensors_(std::make_shared(), pool), + int64_tensors_(std::make_shared(), pool), + float_tensors_(std::make_shared(), pool), + double_tensors_(std::make_shared(), pool), + list_offsets_({0}), tuple_offsets_({0}), dict_offsets_({0}) {} + +#define UPDATE(OFFSET, TAG) \ + if (TAG == -1) { \ + TAG = num_tags; \ + num_tags += 1; \ + } \ + RETURN_NOT_OK(offsets_.Append(OFFSET)); \ + RETURN_NOT_OK(types_.Append(TAG)); \ + RETURN_NOT_OK(nones_.AppendToBitmap(true)); + +Status SequenceBuilder::Append() { + RETURN_NOT_OK(offsets_.Append(0)); + RETURN_NOT_OK(types_.Append(0)); + return nones_.AppendToBitmap(false); +} + +Status SequenceBuilder::Append(bool data) { + UPDATE(bools_.length(), bool_tag); + return bools_.Append(data); +} + +Status SequenceBuilder::Append(int64_t data) { + UPDATE(ints_.length(), int_tag); + return ints_.Append(data); +} + +Status SequenceBuilder::Append(uint64_t data) { + UPDATE(ints_.length(), int_tag); + return ints_.Append(data); +} + +Status SequenceBuilder::Append(const char* data, int32_t length) { + UPDATE(strings_.length(), string_tag); + return strings_.Append(data, length); +} + +Status SequenceBuilder::Append(float data) { + UPDATE(floats_.length(), float_tag); + return floats_.Append(data); +} + +Status SequenceBuilder::Append(double data) { + UPDATE(doubles_.length(), double_tag); + return doubles_.Append(data); +} + +#define DEF_TENSOR_APPEND(NAME, TYPE, TAG) \ + Status SequenceBuilder::Append(const std::vector& dims, TYPE* data) { \ + UPDATE(NAME.length(), TAG); \ + return NAME.Append(dims, data); \ + } + +DEF_TENSOR_APPEND(uint8_tensors_, uint8_t, uint8_tensor_tag); +DEF_TENSOR_APPEND(int8_tensors_, int8_t, int8_tensor_tag); +DEF_TENSOR_APPEND(uint16_tensors_, uint16_t, uint16_tensor_tag); +DEF_TENSOR_APPEND(int16_tensors_, int16_t, int16_tensor_tag); +DEF_TENSOR_APPEND(uint32_tensors_, uint32_t, uint32_tensor_tag); +DEF_TENSOR_APPEND(int32_tensors_, int32_t, int32_tensor_tag); +DEF_TENSOR_APPEND(uint64_tensors_, uint64_t, uint64_tensor_tag); +DEF_TENSOR_APPEND(int64_tensors_, int64_t, int64_tensor_tag); +DEF_TENSOR_APPEND(float_tensors_, float, float_tensor_tag); +DEF_TENSOR_APPEND(double_tensors_, double, double_tensor_tag); + +Status SequenceBuilder::AppendList(int32_t size) { + UPDATE(list_offsets_.size() - 1, list_tag); + list_offsets_.push_back(list_offsets_.back() + size); + return Status::OK(); +} + +Status SequenceBuilder::AppendTuple(int32_t size) { + UPDATE(tuple_offsets_.size() - 1, tuple_tag); + tuple_offsets_.push_back(tuple_offsets_.back() + size); + return Status::OK(); +} + +Status SequenceBuilder::AppendDict(int32_t size) { + UPDATE(dict_offsets_.size() - 1, dict_tag); + dict_offsets_.push_back(dict_offsets_.back() + size); + return Status::OK(); +} + +#define ADD_ELEMENT(VARNAME, TAG) \ + if (TAG != -1) { \ + types[TAG] = VARNAME.type(); \ + children[TAG] = VARNAME.Finish(); \ + ARROW_CHECK_OK(nones_.AppendToBitmap(true)); \ + } + +#define ADD_SUBSEQUENCE(DATA, OFFSETS, BUILDER, TAG, NAME) \ + if (DATA) { \ + DCHECK(DATA->length() == OFFSETS.back()); \ + auto list_builder = std::make_shared(pool_, DATA); \ + auto field = std::make_shared(NAME, list_builder->type()); \ + auto type = std::make_shared(std::vector({field})); \ + auto lists = std::vector>({list_builder}); \ + StructBuilder builder(pool_, type, lists); \ + OFFSETS.pop_back(); \ + ARROW_CHECK_OK(list_builder->Append(OFFSETS.data(), OFFSETS.size())); \ + builder.Append(); \ + ADD_ELEMENT(builder, TAG); \ + } else { \ + DCHECK(OFFSETS.size() == 1); \ + } + +std::shared_ptr SequenceBuilder::Finish( + std::shared_ptr list_data, + std::shared_ptr tuple_data, + std::shared_ptr dict_data) { + + std::vector types(num_tags); + std::vector children(num_tags); + + ADD_ELEMENT(bools_, bool_tag); + ADD_ELEMENT(ints_, int_tag); + ADD_ELEMENT(strings_, string_tag); + ADD_ELEMENT(floats_, float_tag); + ADD_ELEMENT(doubles_, double_tag); + + ADD_ELEMENT(uint8_tensors_, uint8_tensor_tag); + + ADD_ELEMENT(int8_tensors_, int8_tensor_tag); + ADD_ELEMENT(uint16_tensors_, uint16_tensor_tag); + ADD_ELEMENT(int16_tensors_, int16_tensor_tag); + ADD_ELEMENT(uint32_tensors_, uint32_tensor_tag); + + ADD_ELEMENT(int32_tensors_, int32_tensor_tag); + ADD_ELEMENT(uint64_tensors_, uint64_tensor_tag); + ADD_ELEMENT(int64_tensors_, int64_tensor_tag); + + ADD_ELEMENT(float_tensors_, float_tensor_tag); + ADD_ELEMENT(double_tensors_, double_tensor_tag); + + ADD_SUBSEQUENCE(list_data, list_offsets_, list_builder, list_tag, "list"); + ADD_SUBSEQUENCE(tuple_data, tuple_offsets_, tuple_builder, tuple_tag, "tuple"); + ADD_SUBSEQUENCE(dict_data, dict_offsets_, dict_builder, dict_tag, "dict"); + + TypePtr type = TypePtr(new DenseUnionType(types)); + + return std::make_shared(type, types_.length(), + children, types_.data(), offsets_.data(), + nones_.null_count(), nones_.null_bitmap()); +} + +} diff --git a/cpp/src/numbuf/sequence.h b/cpp/src/numbuf/sequence.h new file mode 100644 index 000000000..f30ea00fe --- /dev/null +++ b/cpp/src/numbuf/sequence.h @@ -0,0 +1,138 @@ +#ifndef NUMBUF_LIST_H +#define NUMBUF_LIST_H + +#include +#include +#include "tensor.h" + +namespace numbuf { + +/*! A Sequence is a heterogeneous collections of elements. It can contain + scalar Python types, lists, tuples, dictionaries and tensors. +*/ +class SequenceBuilder { + public: + SequenceBuilder(arrow::MemoryPool* pool = nullptr); + + //! Appending a none to the sequence + arrow::Status Append(); + + //! Appending a boolean to the sequence + arrow::Status Append(bool data); + + //! Appending an int64_t to the sequence + arrow::Status Append(int64_t data); + + //! Appending an uint64_t to the sequence + arrow::Status Append(uint64_t data); + + //! Appending a string to the sequence + arrow::Status Append(const char* data, int32_t length); + + //! Appending a float to the sequence + arrow::Status Append(float data); + + //! Appending a double to the sequence + arrow::Status Append(double data); + + /*! Appending a tensor to the sequence + + \param dims + A vector of dimensions + + \param data + A pointer to the start of the data block. The length of the data block + will be the product of the dimensions + */ + arrow::Status Append(const std::vector& dims, uint8_t* data); + arrow::Status Append(const std::vector& dims, int8_t* data); + arrow::Status Append(const std::vector& dims, uint16_t* data); + arrow::Status Append(const std::vector& dims, int16_t* data); + arrow::Status Append(const std::vector& dims, uint32_t* data); + arrow::Status Append(const std::vector& dims, int32_t* data); + arrow::Status Append(const std::vector& dims, uint64_t* data); + arrow::Status Append(const std::vector& dims, int64_t* data); + arrow::Status Append(const std::vector& dims, float* data); + arrow::Status Append(const std::vector& dims, double* data); + + /*! Add a sublist to the sequenc. The data contained in the sublist will be + specified in the "Finish" method. + + To construct l = [[11, 22], 33, [44, 55]] you would for example run + list = ListBuilder(); + list.AppendList(2); + list.Append(33); + list.AppendList(2); + list.Finish([11, 22, 44, 55]); + list.Finish(); + + \param size + The size of the sublist + */ + arrow::Status AppendList(int32_t size); + + arrow::Status AppendTuple(int32_t size); + + arrow::Status AppendDict(int32_t size); + + //! Finish building the sequence and return the result + std::shared_ptr Finish( + std::shared_ptr list_data, + std::shared_ptr tuple_data, + std::shared_ptr dict_data); + + private: + arrow::MemoryPool* pool_; + + arrow::Int8Builder types_; + arrow::Int32Builder offsets_; + + arrow::NullArrayBuilder nones_; + arrow::BooleanBuilder bools_; + arrow::Int64Builder ints_; + arrow::StringBuilder strings_; + arrow::FloatBuilder floats_; + arrow::DoubleBuilder doubles_; + + UInt8TensorBuilder uint8_tensors_; + Int8TensorBuilder int8_tensors_; + UInt16TensorBuilder uint16_tensors_; + Int16TensorBuilder int16_tensors_; + UInt32TensorBuilder uint32_tensors_; + Int32TensorBuilder int32_tensors_; + UInt64TensorBuilder uint64_tensors_; + Int64TensorBuilder int64_tensors_; + FloatTensorBuilder float_tensors_; + DoubleTensorBuilder double_tensors_; + + std::vector list_offsets_; + std::vector tuple_offsets_; + std::vector dict_offsets_; + + int8_t bool_tag = -1; + int8_t int_tag = -1; + int8_t string_tag = -1; + int8_t float_tag = -1; + int8_t double_tag = -1; + + int8_t uint8_tensor_tag = -1; + int8_t int8_tensor_tag = -1; + int8_t uint16_tensor_tag = -1; + int8_t int16_tensor_tag = -1; + int8_t uint32_tensor_tag = -1; + int8_t int32_tensor_tag = -1; + int8_t uint64_tensor_tag = -1; + int8_t int64_tensor_tag = -1; + int8_t float_tensor_tag = -1; + int8_t double_tensor_tag = -1; + + int8_t list_tag = -1; + int8_t tuple_tag = -1; + int8_t dict_tag = -1; + + int8_t num_tags = 0; +}; + +} // namespace numbuf + +#endif // NUMBUF_LIST_H diff --git a/cpp/src/numbuf/tensor.cc b/cpp/src/numbuf/tensor.cc new file mode 100644 index 000000000..f0f7647de --- /dev/null +++ b/cpp/src/numbuf/tensor.cc @@ -0,0 +1,50 @@ +#include "tensor.h" + +using namespace arrow; + +namespace numbuf { + +template +TensorBuilder::TensorBuilder(const TypePtr& dtype, MemoryPool* pool) + : dtype_(dtype) { + dim_data_ = std::make_shared(pool); + dims_ = std::make_shared(pool, dim_data_); + value_data_ = std::make_shared>(pool, dtype); + values_ = std::make_shared(pool, value_data_); + auto dims_field = std::make_shared("dims", dims_->type()); + auto values_field = std::make_shared("data", values_->type()); + auto type = std::make_shared(std::vector({dims_field, values_field})); + tensors_ = std::make_shared(pool, type, std::vector>({dims_, values_})); +}; + +template +Status TensorBuilder::Append(const std::vector& dims, const elem_type* data) { + RETURN_NOT_OK(tensors_->Append()); + RETURN_NOT_OK(dims_->Append()); + RETURN_NOT_OK(values_->Append()); + int32_t size = 1; + for (auto dim : dims) { + size *= dim; + RETURN_NOT_OK(dim_data_->Append(dim)); + } + RETURN_NOT_OK(value_data_->Append(data, size)); + return Status::OK(); // tensors_->Append(); +} + +template +std::shared_ptr TensorBuilder::Finish() { + return tensors_->Finish(); +} + +template class TensorBuilder; +template class TensorBuilder; +template class TensorBuilder; +template class TensorBuilder; +template class TensorBuilder; +template class TensorBuilder; +template class TensorBuilder; +template class TensorBuilder; +template class TensorBuilder; +template class TensorBuilder; + +} diff --git a/cpp/src/numbuf/tensor.h b/cpp/src/numbuf/tensor.h new file mode 100644 index 000000000..351892758 --- /dev/null +++ b/cpp/src/numbuf/tensor.h @@ -0,0 +1,67 @@ +#ifndef NUMBUF_TENSOR_H +#define NUMBUF_TENSOR_H + +#include +#include +#include + +namespace numbuf { + +/*! This is a class for building a dataframe where each row corresponds to + a Tensor (= multidimensional array) of numerical data. There are two + columns, "dims" which contains an array of dimensions for each Tensor + and "data" which contains data buffer of the Tensor as a flattened array. +*/ +template +class TensorBuilder { +public: + typedef typename T::c_type elem_type; + + TensorBuilder(const arrow::TypePtr& dtype, arrow::MemoryPool* pool = nullptr); + + /*! Append a new tensor. + + \param dims + The dimensions of the Tensor + + \param data + Pointer to the beginning of the data buffer of the Tensor. The + total length of the buffer is sizeof(elem_type) * product of dims[i] over i + */ + arrow::Status Append(const std::vector& dims, const elem_type* data); + + //! Convert the tensors to an Arrow StructArray + std::shared_ptr Finish(); + + //! Number of tensors in the column + int32_t length() { + return tensors_->length(); + } + + const arrow::TypePtr& type() { + return tensors_->type(); + } + +private: + arrow::TypePtr dtype_; + std::shared_ptr dim_data_; + std::shared_ptr dims_; + std::shared_ptr> value_data_; + std::shared_ptr values_; + std::shared_ptr tensors_; +}; + +typedef TensorBuilder UInt8TensorBuilder; +typedef TensorBuilder Int8TensorBuilder; +typedef TensorBuilder UInt16TensorBuilder; +typedef TensorBuilder Int16TensorBuilder; +typedef TensorBuilder UInt32TensorBuilder; +typedef TensorBuilder Int32TensorBuilder; +typedef TensorBuilder UInt64TensorBuilder; +typedef TensorBuilder Int64TensorBuilder; +typedef TensorBuilder FloatTensorBuilder; +typedef TensorBuilder DoubleTensorBuilder; + +} + +#endif // NUMBUF_TENSOR_H diff --git a/python/src/pynumbuf/adapters/numpy.cc b/python/src/pynumbuf/adapters/numpy.cc new file mode 100644 index 000000000..6ca6753ca --- /dev/null +++ b/python/src/pynumbuf/adapters/numpy.cc @@ -0,0 +1,88 @@ +#include "numpy.h" + +#include + +using namespace arrow; + +namespace numbuf { + +#define ARROW_TYPE_TO_NUMPY_CASE(TYPE) \ + case Type::TYPE: \ + return NPY_##TYPE; + +#define DESERIALIZE_ARRAY_CASE(TYPE, ArrayType, type) \ + case Type::TYPE: { \ + auto values = std::dynamic_pointer_cast(content->values()); \ + DCHECK(values); \ + type* data = const_cast(values->raw_data()) \ + + content->offset(offset); \ + *out = PyArray_SimpleNewFromData(num_dims, dim.data(), NPY_##TYPE, \ + reinterpret_cast(data)); \ + } \ + return Status::OK(); + +Status DeserializeArray(std::shared_ptr array, int32_t offset, PyObject** out) { + DCHECK(array); + auto tensor = std::dynamic_pointer_cast(array); + DCHECK(tensor); + auto dims = std::dynamic_pointer_cast(tensor->field(0)); + auto content = std::dynamic_pointer_cast(tensor->field(1)); + npy_intp num_dims = dims->value_length(offset); + std::vector dim(num_dims); + for (int i = dims->offset(offset); i < dims->offset(offset+1); ++i) { + dim[i - dims->offset(offset)] = + std::dynamic_pointer_cast(dims->values())->Value(i); + } + switch (content->value_type()->type) { + DESERIALIZE_ARRAY_CASE(INT8, Int8Array, int8_t) + DESERIALIZE_ARRAY_CASE(INT16, Int16Array, int16_t) + DESERIALIZE_ARRAY_CASE(INT32, Int32Array, int32_t) + DESERIALIZE_ARRAY_CASE(INT64, Int64Array, int64_t) + DESERIALIZE_ARRAY_CASE(UINT8, UInt8Array, uint8_t) + DESERIALIZE_ARRAY_CASE(UINT16, UInt16Array, uint16_t) + DESERIALIZE_ARRAY_CASE(UINT32, UInt32Array, uint32_t) + DESERIALIZE_ARRAY_CASE(UINT64, UInt64Array, uint64_t) + DESERIALIZE_ARRAY_CASE(FLOAT, FloatArray, float) + DESERIALIZE_ARRAY_CASE(DOUBLE, DoubleArray, double) + default: + DCHECK(false) << "arrow type not recognized: " << content->value_type()->type; + } + return Status::OK(); +} + +Status SerializeArray(PyArrayObject* array, SequenceBuilder& builder) { + size_t ndim = PyArray_NDIM(array); + int dtype = PyArray_TYPE(array); + std::vector dims(ndim); + for (int i = 0; i < ndim; ++i) { + dims[i] = PyArray_DIM(array, i); + } + auto data = PyArray_DATA(array); + switch (dtype) { + case NPY_UINT8: + return builder.Append(dims, reinterpret_cast(data)); + case NPY_INT8: + return builder.Append(dims, reinterpret_cast(data)); + case NPY_UINT16: + return builder.Append(dims, reinterpret_cast(data)); + case NPY_INT16: + return builder.Append(dims, reinterpret_cast(data)); + case NPY_UINT32: + return builder.Append(dims, reinterpret_cast(data)); + case NPY_INT32: + return builder.Append(dims, reinterpret_cast(data)); + case NPY_UINT64: + return builder.Append(dims, reinterpret_cast(data)); + case NPY_INT64: + return builder.Append(dims, reinterpret_cast(data)); + case NPY_FLOAT: + return builder.Append(dims, reinterpret_cast(data)); + case NPY_DOUBLE: + return builder.Append(dims, reinterpret_cast(data)); + default: + DCHECK(false) << "numpy data type not recognized: " << dtype; + } + return Status::OK(); +} + +} diff --git a/python/src/pynumbuf/adapters/numpy.h b/python/src/pynumbuf/adapters/numpy.h new file mode 100644 index 000000000..8b8b3859a --- /dev/null +++ b/python/src/pynumbuf/adapters/numpy.h @@ -0,0 +1,22 @@ +#ifndef PYNUMBUF_NUMPY_H +#define PYNUMBUF_NUMPY_H + +#include +#include + +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#define NO_IMPORT_ARRAY +#define PY_ARRAY_UNIQUE_SYMBOL NUMBUF_ARRAY_API +#include + +#include +#include + +namespace numbuf { + +arrow::Status SerializeArray(PyArrayObject* array, SequenceBuilder& builder); +arrow::Status DeserializeArray(std::shared_ptr array, int32_t offset, PyObject** out); + +} + +#endif diff --git a/python/src/pynumbuf/adapters/python.cc b/python/src/pynumbuf/adapters/python.cc new file mode 100644 index 000000000..751ebf91b --- /dev/null +++ b/python/src/pynumbuf/adapters/python.cc @@ -0,0 +1,193 @@ +#include "python.h" + +#include + +#include "scalars.h" + +using namespace arrow; + +namespace numbuf { + +PyObject* get_value(ArrayPtr arr, int32_t index, int32_t type) { + PyObject* result; + switch (arr->type()->type) { + case Type::BOOL: + return PyBool_FromLong(std::static_pointer_cast(arr)->Value(index)); + case Type::INT64: + return PyInt_FromLong(std::static_pointer_cast(arr)->Value(index)); + case Type::STRING: { + int32_t nchars; + const uint8_t* str = std::static_pointer_cast(arr)->GetValue(index, &nchars); + return PyString_FromStringAndSize(reinterpret_cast(str), nchars); + } + case Type::FLOAT: + return PyFloat_FromDouble(std::static_pointer_cast(arr)->Value(index)); + case Type::DOUBLE: + return PyFloat_FromDouble(std::static_pointer_cast(arr)->Value(index)); + case Type::STRUCT: { + auto s = std::static_pointer_cast(arr); + auto l = std::static_pointer_cast(s->field(0)); + if (s->type()->child(0)->name == "list") { + ARROW_CHECK_OK(DeserializeList(l->values(), l->value_offset(index), l->value_offset(index+1), &result)); + } else if (s->type()->child(0)->name == "tuple") { + ARROW_CHECK_OK(DeserializeTuple(l->values(), l->value_offset(index), l->value_offset(index+1), &result)); + } else if (s->type()->child(0)->name == "dict") { + ARROW_CHECK_OK(DeserializeDict(l->values(), l->value_offset(index), l->value_offset(index+1), &result)); + } else { + ARROW_CHECK_OK(DeserializeArray(arr, index, &result)); + } + return result; + } + default: + DCHECK(false) << "union tag not recognized " << type; + } + return NULL; +} + +Status append(PyObject* elem, SequenceBuilder& builder, + std::vector& sublists, + std::vector& subtuples, + std::vector& subdicts) { + // The bool case must precede the int case (PyInt_Check passes for bools) + if (PyBool_Check(elem)) { + RETURN_NOT_OK(builder.Append(elem == Py_True)); + } else if (PyFloat_Check(elem)) { + RETURN_NOT_OK(builder.Append(PyFloat_AS_DOUBLE(elem))); + } else if (PyLong_Check(elem)) { + int overflow = 0; + int64_t data = PyLong_AsLongLongAndOverflow(elem, &overflow); + RETURN_NOT_OK(builder.Append(data)); + if(overflow) { + return Status::NotImplemented("long overflow"); + } + } else if (PyInt_Check(elem)) { + RETURN_NOT_OK(builder.Append(PyInt_AS_LONG(elem))); + } else if (PyString_Check(elem)) { + RETURN_NOT_OK(builder.Append(PyString_AS_STRING(elem), PyString_GET_SIZE(elem))); + } else if (PyList_Check(elem)) { + builder.AppendList(PyList_Size(elem)); + sublists.push_back(elem); + } else if (PyDict_Check(elem)) { + builder.AppendDict(PyDict_Size(elem)); + subdicts.push_back(elem); + } else if (PyTuple_Check(elem)) { + builder.AppendTuple(PyTuple_Size(elem)); + subtuples.push_back(elem); + } else if (PyArray_IsScalar(elem, Generic)) { + RETURN_NOT_OK(AppendScalar(elem, builder)); + } else if (PyArray_Check(elem)) { + RETURN_NOT_OK(SerializeArray((PyArrayObject*) elem, builder)); + } else if (elem == Py_None) { + RETURN_NOT_OK(builder.Append()); + } else { + std::stringstream ss; + ss << "data type of " << PyString_AS_STRING(PyObject_Repr(elem)) + << " not recognized"; + return Status::NotImplemented(ss.str()); + } + return Status::OK(); +} + +Status SerializeSequences(std::vector sequences, std::shared_ptr* out) { + DCHECK(out); + SequenceBuilder builder(nullptr); + std::vector sublists, subtuples, subdicts; + for (const auto& sequence : sequences) { + PyObject* item; + PyObject* iterator = PyObject_GetIter(sequence); + while (item = PyIter_Next(iterator)) { + RETURN_NOT_OK(append(item, builder, sublists, subtuples, subdicts)); + Py_DECREF(item); + } + Py_DECREF(iterator); + } + std::shared_ptr list; + if (sublists.size() > 0) { + RETURN_NOT_OK(SerializeSequences(sublists, &list)); + } + std::shared_ptr tuple; + if (subtuples.size() > 0) { + RETURN_NOT_OK(SerializeSequences(subtuples, &tuple)); + } + std::shared_ptr dict; + if (subdicts.size() > 0) { + RETURN_NOT_OK(SerializeDict(subdicts, &dict)); + } + *out = builder.Finish(list, tuple, dict); + return Status::OK(); +} + +#define DESERIALIZE_SEQUENCE(CREATE, SET_ITEM) \ + auto data = std::dynamic_pointer_cast(array); \ + int32_t size = array->length(); \ + PyObject* result = CREATE(stop_idx - start_idx); \ + auto types = std::make_shared(size, data->types()); \ + auto offsets = std::make_shared(size, data->offset_buf()); \ + for (size_t i = start_idx; i < stop_idx; ++i) { \ + if (data->IsNull(i)) { \ + Py_INCREF(Py_None); \ + SET_ITEM(result, i-start_idx, Py_None); \ + } else { \ + int32_t offset = offsets->Value(i); \ + int8_t type = types->Value(i); \ + ArrayPtr arr = data->child(type); \ + SET_ITEM(result, i-start_idx, get_value(arr, offset, type)); \ + } \ + } \ + *out = result; \ + return Status::OK(); + +Status DeserializeList(std::shared_ptr array, int32_t start_idx, int32_t stop_idx, PyObject** out) { + DESERIALIZE_SEQUENCE(PyList_New, PyList_SetItem) +} + +Status DeserializeTuple(std::shared_ptr array, int32_t start_idx, int32_t stop_idx, PyObject** out) { + DESERIALIZE_SEQUENCE(PyTuple_New, PyTuple_SetItem) +} + +Status SerializeDict(std::vector dicts, std::shared_ptr* out) { + DictBuilder result; + std::vector sublists, subtuples, subdicts, dummy; + for (const auto& dict : dicts) { + PyObject *key, *value; + Py_ssize_t pos = 0; + while (PyDict_Next(dict, &pos, &key, &value)) { + RETURN_NOT_OK(append(key, result.keys(), dummy, dummy, dummy)); + DCHECK(dummy.size() == 0); + RETURN_NOT_OK(append(value, result.vals(), sublists, subtuples, subdicts)); + } + } + std::shared_ptr val_list; + if (sublists.size() > 0) { + RETURN_NOT_OK(SerializeSequences(sublists, &val_list)); + } + std::shared_ptr val_tuples; + if (subtuples.size() > 0) { + RETURN_NOT_OK(SerializeSequences(subtuples, &val_tuples)); + } + std::shared_ptr val_dict; + if (subdicts.size() > 0) { + RETURN_NOT_OK(SerializeDict(subdicts, &val_dict)); + } + *out = result.Finish(val_list, val_tuples, val_dict); + return Status::OK(); +} + +Status DeserializeDict(std::shared_ptr array, int32_t start_idx, int32_t stop_idx, PyObject** out) { + auto data = std::dynamic_pointer_cast(array); + // TODO(pcm): error handling, get rid of the temporary copy of the list + PyObject *keys, *vals; + PyObject* result = PyDict_New(); + ARROW_RETURN_NOT_OK(DeserializeList(data->field(0), start_idx, stop_idx, &keys)); + ARROW_RETURN_NOT_OK(DeserializeList(data->field(1), start_idx, stop_idx, &vals)); + for (size_t i = start_idx; i < stop_idx; ++i) { + PyDict_SetItem(result, PyList_GetItem(keys, i - start_idx), PyList_GetItem(vals, i - start_idx)); + } + Py_XDECREF(keys); // PyList_GetItem(keys, ...) incremented the reference count + Py_XDECREF(vals); // PyList_GetItem(vals, ...) incremented the reference count + *out = result; + return Status::OK(); +} + + +} diff --git a/python/src/pynumbuf/adapters/python.h b/python/src/pynumbuf/adapters/python.h new file mode 100644 index 000000000..1fa07f920 --- /dev/null +++ b/python/src/pynumbuf/adapters/python.h @@ -0,0 +1,22 @@ +#ifndef PYNUMBUF_PYTHON_H +#define PYNUMBUF_PYTHON_H + +#include + +#include +#include +#include + +#include "numpy.h" + +namespace numbuf { + +arrow::Status SerializeSequences(std::vector sequences, std::shared_ptr* out); +arrow::Status SerializeDict(std::vector dicts, std::shared_ptr* out); +arrow::Status DeserializeList(std::shared_ptr array, int32_t start_idx, int32_t stop_idx, PyObject** out); +arrow::Status DeserializeTuple(std::shared_ptr array, int32_t start_idx, int32_t stop_idx, PyObject** out); +arrow::Status DeserializeDict(std::shared_ptr array, int32_t start_idx, int32_t stop_idx, PyObject** out); + +} + +#endif diff --git a/python/src/pynumbuf/adapters/scalars.h b/python/src/pynumbuf/adapters/scalars.h new file mode 100644 index 000000000..928289252 --- /dev/null +++ b/python/src/pynumbuf/adapters/scalars.h @@ -0,0 +1,54 @@ +#ifndef PYNUMBUF_SCALARS_H +#define PYNUMBUF_SCALARS_H + +#include + +#include +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#define NO_IMPORT_ARRAY +#define PY_ARRAY_UNIQUE_SYMBOL NUMBUF_ARRAY_API +#include +#include + +#include + +namespace numbuf { + +arrow::Status AppendScalar(PyObject* obj, SequenceBuilder& builder) { + if (PyArray_IsScalar(obj, Bool)) { + return builder.Append(((PyBoolScalarObject *)obj)->obval != 0); + } else if (PyArray_IsScalar(obj, Float)) { + return builder.Append(((PyFloatScalarObject *)obj)->obval); + } else if (PyArray_IsScalar(obj, Double)) { + return builder.Append(((PyDoubleScalarObject *)obj)->obval); + } + int64_t value = 0; + if (PyArray_IsScalar(obj, Byte)) { + value = ((PyByteScalarObject *)obj)->obval; + } else if (PyArray_IsScalar(obj, UByte)) { + value = ((PyUByteScalarObject *)obj)->obval; + } else if (PyArray_IsScalar(obj, Short)) { + value = ((PyShortScalarObject *)obj)->obval; + } else if (PyArray_IsScalar(obj, UShort)) { + value = ((PyUShortScalarObject *)obj)->obval; + } else if (PyArray_IsScalar(obj, Int)) { + value = ((PyIntScalarObject *)obj)->obval; + } else if (PyArray_IsScalar(obj, UInt)) { + value = ((PyUIntScalarObject *)obj)->obval; + } else if (PyArray_IsScalar(obj, Long)) { + value = ((PyLongScalarObject *)obj)->obval; + } else if (PyArray_IsScalar(obj, ULong)) { + value = ((PyULongScalarObject *)obj)->obval; + } else if (PyArray_IsScalar(obj, LongLong)) { + value = ((PyLongLongScalarObject *)obj)->obval; + } else if (PyArray_IsScalar(obj, ULongLong)) { + value = ((PyULongLongScalarObject *)obj)->obval; + } else { + DCHECK(false) << "scalar type not recognized"; + } + return builder.Append(value); +} + +} // namespace + +#endif // PYNUMBUF_SCALARS_H diff --git a/python/src/pynumbuf/memory.h b/python/src/pynumbuf/memory.h new file mode 100644 index 000000000..ebaa22c51 --- /dev/null +++ b/python/src/pynumbuf/memory.h @@ -0,0 +1,47 @@ +#ifndef PYNUMBUF_MEMORY_H +#define PYNUMBUF_MEMORY_H + +#include + +namespace numbuf { + +class BufferSource : public arrow::ipc::MemorySource { + public: + virtual ~BufferSource() {} + + explicit BufferSource(uint8_t* data, int64_t nbytes) + : data_(data), size_(nbytes) {} + + arrow::Status ReadAt(int64_t position, int64_t nbytes, + std::shared_ptr* out) override { + DCHECK(out); + DCHECK(position + nbytes <= size_) << "position: " << position << " nbytes: " << nbytes << "size: " << size_; + *out = std::make_shared(data_ + position, nbytes); + return arrow::Status::OK(); + } + + arrow::Status Close() override { + return arrow::Status::OK(); + } + + arrow::Status Write(int64_t position, const uint8_t* data, + int64_t nbytes) override { + DCHECK(position >= 0 && position < size_); + DCHECK(position + nbytes <= size_) << "position: " << position << " nbytes: " << nbytes << "size: " << size_; + uint8_t* dst = data_ + position; + memcpy(dst, data, nbytes); + return arrow::Status::OK(); + } + + int64_t Size() const override { + return size_; + } + +private: + uint8_t* data_; + int64_t size_; +}; + +} // namespace + +#endif // PYNUMBUF_MEMORY_H diff --git a/python/src/pynumbuf/numbuf.cc b/python/src/pynumbuf/numbuf.cc new file mode 100644 index 000000000..6b4c524dd --- /dev/null +++ b/python/src/pynumbuf/numbuf.cc @@ -0,0 +1,205 @@ +#include +#include +#include +#include +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#define PY_ARRAY_UNIQUE_SYMBOL NUMBUF_ARRAY_API +#include + +#include + +#include + +#include "adapters/python.h" +#include "memory.h" + +using namespace arrow; +using namespace numbuf; + +extern "C" { + +// Error handling + +static PyObject *NumbufError; + +int PyObjectToArrow(PyObject* object, std::shared_ptr **result) { + if (PyCapsule_IsValid(object, "arrow")) { + *result = reinterpret_cast*>(PyCapsule_GetPointer(object, "arrow")); + return 1; + } else { + PyErr_SetString(PyExc_TypeError, "must be an 'arrow' capsule"); + return 0; + } +} + +static void ArrowCapsule_Destructor(PyObject* capsule) { + delete reinterpret_cast*>(PyCapsule_GetPointer(capsule, "arrow")); +} + +std::shared_ptr make_row_batch(std::shared_ptr data) { + auto field = std::make_shared("list", data->type()); + std::shared_ptr schema(new Schema({field})); + return std::shared_ptr(new RowBatch(schema, data->length(), {data})); +} + +/*! Serializes a Python list into an Arrow array. + + \param args + The argument must be a Python list + + \returns + A Python "arrow" capsule containing the arrow::Array +*/ +PyObject* serialize_list(PyObject* self, PyObject* args) { + PyObject* value; + if (!PyArg_ParseTuple(args, "O", &value)) { + return NULL; + } + std::shared_ptr* result = new std::shared_ptr(); + if (PyList_Check(value)) { + Status s = SerializeSequences(std::vector({value}), result); + if (!s.ok()) { + PyErr_SetString(NumbufError, s.ToString().c_str()); + return NULL; + } + return PyCapsule_New(reinterpret_cast(result), "arrow", &ArrowCapsule_Destructor); + } + return NULL; +} + +/*! Number of bytes the serialized version of the object will take. + + \param args + A Python "arrow" capsule containing the arrow::Array + + \returns + Size of the object in memory once it is serialized +*/ +PyObject* get_serialized_size(PyObject* self, PyObject* args) { + std::shared_ptr* data; + if (!PyArg_ParseTuple(args, "O&", &PyObjectToArrow, &data)) { + return NULL; + } + auto batch = make_row_batch(*data); + int64_t size = 0; + ARROW_CHECK_OK(arrow::ipc::GetRowBatchSize(batch.get(), &size)); + return PyInt_FromLong(size); +} + +/*! Serialize an arrow::Array into a buffer. + + \param args + A Python "arrow" capsule containing the arrow::Array and + a python memoryview object the data will be written to + + \return + The arrow metadata offset for the arrow metadata +*/ +PyObject* write_to_buffer(PyObject* self, PyObject* args) { + std::shared_ptr* data; + PyObject* memoryview; + if (!PyArg_ParseTuple(args, "O&O", &PyObjectToArrow, &data, &memoryview)) { + return NULL; + } + if (!PyMemoryView_Check(memoryview)) { + return NULL; + } + auto batch = make_row_batch(*data); + Py_buffer* buffer = PyMemoryView_GET_BUFFER(memoryview); + auto target = std::make_shared(reinterpret_cast(buffer->buf), buffer->len); + int64_t metadata_offset; + ARROW_CHECK_OK(ipc::WriteRowBatch(target.get(), batch.get(), 0, &metadata_offset)); + return PyInt_FromLong(metadata_offset); +} + +/*! Serialize schema metadata associated to and arrow::Array + + \param args + A Python "arrow" capsule containing the arrow::Array + + \return + A bytearray object containing the schema metadata +*/ +PyObject* get_schema_metadata(PyObject* self, PyObject* args) { + std::shared_ptr* data; + if (!PyArg_ParseTuple(args, "O&", &PyObjectToArrow, &data)) { + return NULL; + } + auto batch = make_row_batch(*data); + std::shared_ptr buffer; + ARROW_CHECK_OK(ipc::WriteSchema(batch->schema().get(), &buffer)); + auto ptr = reinterpret_cast(buffer->data()); + return PyByteArray_FromStringAndSize(ptr, buffer->size()); +} + +/*! Read serialized data from buffer and produce an arrow capsule + + \param args + A Python memoryview from which data will be loaded, + a Python bytearray containing the metadata and the metadata_offset + + \return + A Python "arrow" capsule containing the arrow data +*/ +PyObject* read_from_buffer(PyObject* self, PyObject* args) { + PyObject* memoryview; + PyObject* metadata; + int64_t metadata_offset; + if (!PyArg_ParseTuple(args, "OOl", &memoryview, &metadata, &metadata_offset)) { + return NULL; + } + + auto ptr = reinterpret_cast(PyByteArray_AsString(metadata)); + auto schema_buffer = std::make_shared(ptr, PyByteArray_Size(metadata)); + std::shared_ptr message; + ARROW_CHECK_OK(ipc::Message::Open(schema_buffer, &message)); + DCHECK_EQ(ipc::Message::SCHEMA, message->type()); + std::shared_ptr schema_msg = message->GetSchema(); + std::shared_ptr schema; + ARROW_CHECK_OK(schema_msg->GetSchema(&schema)); + + Py_buffer* buffer = PyMemoryView_GET_BUFFER(memoryview); + auto source = std::make_shared(reinterpret_cast(buffer->buf), buffer->len); + std::shared_ptr reader; + ARROW_CHECK_OK(arrow::ipc::RowBatchReader::Open(source.get(), metadata_offset, &reader)); + std::shared_ptr data; + ARROW_CHECK_OK(reader->GetRowBatch(schema, &data)); + + std::shared_ptr* result = new std::shared_ptr(); + *result = data->column(0); + return PyCapsule_New(reinterpret_cast(result), "arrow", &ArrowCapsule_Destructor); +} + +/*! +*/ +PyObject* deserialize_list(PyObject* self, PyObject* args) { + std::shared_ptr* data; + if (!PyArg_ParseTuple(args, "O&", &PyObjectToArrow, &data)) { + return NULL; + } + PyObject* result; + ARROW_CHECK_OK(DeserializeList(*data, 0, (*data)->length(), &result)); + return result; +} + +static PyMethodDef NumbufMethods[] = { + { "serialize_list", serialize_list, METH_VARARGS, "serialize a Python list" }, + { "deserialize_list", deserialize_list, METH_VARARGS, "deserialize a Python list" }, + { "get_serialized_size", get_serialized_size, METH_VARARGS, "get the number of bytes the object will occupy once serialized" }, + { "write_to_buffer", write_to_buffer, METH_VARARGS, "write serialized data to buffer"}, + { "read_from_buffer", read_from_buffer, METH_VARARGS, "read serialized data from buffer"}, + { "get_schema_metadata", get_schema_metadata, METH_VARARGS, "return the schema of an arrow object"}, + { NULL, NULL, 0, NULL } +}; + +PyMODINIT_FUNC initlibnumbuf(void) { + PyObject* m; + m = Py_InitModule3("libnumbuf", NumbufMethods, "Python C Extension for Numbuf"); + char numbuf_error[] = "numbuf.error"; + NumbufError = PyErr_NewException(numbuf_error, NULL, NULL); + Py_INCREF(NumbufError); + PyModule_AddObject(m, "numbuf_error", NumbufError); + import_array(); +} + +} diff --git a/python/test/runtest.py b/python/test/runtest.py new file mode 100644 index 000000000..f3697d51f --- /dev/null +++ b/python/test/runtest.py @@ -0,0 +1,63 @@ +import unittest +import libnumbuf +import numpy as np +from numpy.testing import assert_equal + +TEST_OBJECTS = [[1, "hello", 3.0], 42, 43L, "hello world", 42.0, 1L << 62, + (1.0, "hi"), None, (None, None), ("hello", None), + True, False, (True, False), + {True: "hello", False: "world"}, + {"hello" : "world", 1: 42, 1.0: 45}, {}, + np.int8(3), np.int32(4), np.int64(5), + np.uint8(3), np.uint32(4), np.uint64(5), + np.float32(1.0), np.float64(1.0)] + +class SerializationTests(unittest.TestCase): + + def roundTripTest(self, data): + serialized = libnumbuf.serialize_list(data) + result = libnumbuf.deserialize_list(serialized) + assert_equal(data, result) + + def testSimple(self): + self.roundTripTest([1, 2, 3]) + self.roundTripTest([1.0, 2.0, 3.0]) + self.roundTripTest(['hello', 'world']) + self.roundTripTest([1, 'hello', 1.0]) + self.roundTripTest([{'hello': 1.0, 'world': 42}]) + self.roundTripTest([True, False]) + + def testNone(self): + self.roundTripTest([1, 2, None, 3]) + + def testNested(self): + self.roundTripTest([{"hello": {"world": (1, 2, 3)}}]) + self.roundTripTest([((1,), (1, 2, 3, (4, 5, 6), "string"))]) + self.roundTripTest([{"hello": [1, 2, 3]}]) + self.roundTripTest([{"hello": [1, [2, 3]]}]) + self.roundTripTest([{"hello": (None, 2, [3, 4])}]) + self.roundTripTest([{"hello": (None, 2, [3, 4], np.ndarray([1.0, 2.0, 3.0]))}]) + + def numpyTest(self, t): + a = np.random.randint(0, 10, size=(100, 100)).astype(t) + self.roundTripTest([a]) + + def testArrays(self): + for t in ["int8", "uint8", "int16", "uint16", "int32", "uint32", "float32", "float64"]: + self.numpyTest(t) + + def testRay(self): + for obj in TEST_OBJECTS: + self.roundTripTest([obj]) + + def testBuffer(self): + for (i, obj) in enumerate(TEST_OBJECTS): + x = libnumbuf.serialize_list([1, 2, 3]) + schema = libnumbuf.get_schema_metadata(x) + size = libnumbuf.get_serialized_size(x) + 4096 # INITIAL_METADATA_SIZE in arrow + buff = np.zeros(size, dtype="uint8") + metadata_offset = libnumbuf.write_to_buffer(x, memoryview(buff)) + array = libnumbuf.read_from_buffer(memoryview(buff), schema, metadata_offset) + +if __name__ == "__main__": + unittest.main()