Implement unicode serialization

This commit is contained in:
Philipp Moritz
2016-08-04 15:34:47 -07:00
parent a81dc0c541
commit 8e165d43d4
6 changed files with 85 additions and 52 deletions
+10 -10
View File
@@ -65,34 +65,34 @@ Status SerializeArray(PyArrayObject* array, SequenceBuilder& builder) {
auto data = PyArray_DATA(contiguous);
switch (dtype) {
case NPY_UINT8:
RETURN_NOT_OK(builder.Append(dims, reinterpret_cast<uint8_t*>(data)));
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<uint8_t*>(data)));
break;
case NPY_INT8:
RETURN_NOT_OK(builder.Append(dims, reinterpret_cast<int8_t*>(data)));
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<int8_t*>(data)));
break;
case NPY_UINT16:
RETURN_NOT_OK(builder.Append(dims, reinterpret_cast<uint16_t*>(data)));
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<uint16_t*>(data)));
break;
case NPY_INT16:
RETURN_NOT_OK(builder.Append(dims, reinterpret_cast<int16_t*>(data)));
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<int16_t*>(data)));
break;
case NPY_UINT32:
RETURN_NOT_OK(builder.Append(dims, reinterpret_cast<uint32_t*>(data)));
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<uint32_t*>(data)));
break;
case NPY_INT32:
RETURN_NOT_OK(builder.Append(dims, reinterpret_cast<int32_t*>(data)));
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<int32_t*>(data)));
break;
case NPY_UINT64:
RETURN_NOT_OK(builder.Append(dims, reinterpret_cast<uint64_t*>(data)));
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<uint64_t*>(data)));
break;
case NPY_INT64:
RETURN_NOT_OK(builder.Append(dims, reinterpret_cast<int64_t*>(data)));
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<int64_t*>(data)));
break;
case NPY_FLOAT:
RETURN_NOT_OK(builder.Append(dims, reinterpret_cast<float*>(data)));
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<float*>(data)));
break;
case NPY_DOUBLE:
RETURN_NOT_OK(builder.Append(dims, reinterpret_cast<double*>(data)));
RETURN_NOT_OK(builder.AppendTensor(dims, reinterpret_cast<double*>(data)));
break;
default:
DCHECK(false) << "numpy data type not recognized: " << dtype;
+26 -7
View File
@@ -15,10 +15,15 @@ PyObject* get_value(ArrayPtr arr, int32_t index, int32_t type) {
return PyBool_FromLong(std::static_pointer_cast<BooleanArray>(arr)->Value(index));
case Type::INT64:
return PyInt_FromLong(std::static_pointer_cast<Int64Array>(arr)->Value(index));
case Type::BINARY: {
int32_t nchars;
const uint8_t* str = std::static_pointer_cast<BinaryArray>(arr)->GetValue(index, &nchars);
return PyString_FromStringAndSize(reinterpret_cast<const char*>(str), nchars);
}
case Type::STRING: {
int32_t nchars;
const uint8_t* str = std::static_pointer_cast<StringArray>(arr)->GetValue(index, &nchars);
return PyString_FromStringAndSize(reinterpret_cast<const char*>(str), nchars);
return PyUnicode_FromStringAndSize(reinterpret_cast<const char*>(str), nchars);
}
case Type::FLOAT:
return PyFloat_FromDouble(std::static_pointer_cast<FloatArray>(arr)->Value(index));
@@ -50,20 +55,34 @@ Status append(PyObject* elem, SequenceBuilder& builder,
std::vector<PyObject*>& subdicts) {
// The bool case must precede the int case (PyInt_Check passes for bools)
if (PyBool_Check(elem)) {
RETURN_NOT_OK(builder.Append(elem == Py_True));
RETURN_NOT_OK(builder.AppendBool(elem == Py_True));
} else if (PyFloat_Check(elem)) {
RETURN_NOT_OK(builder.Append(PyFloat_AS_DOUBLE(elem)));
RETURN_NOT_OK(builder.AppendFloat(PyFloat_AS_DOUBLE(elem)));
} else if (PyLong_Check(elem)) {
int overflow = 0;
int64_t data = PyLong_AsLongLongAndOverflow(elem, &overflow);
RETURN_NOT_OK(builder.Append(data));
RETURN_NOT_OK(builder.AppendInt64(data));
if(overflow) {
return Status::NotImplemented("long overflow");
}
} else if (PyInt_Check(elem)) {
RETURN_NOT_OK(builder.Append(static_cast<int64_t>(PyInt_AS_LONG(elem))));
RETURN_NOT_OK(builder.AppendInt64(static_cast<int64_t>(PyInt_AS_LONG(elem))));
} else if (PyString_Check(elem)) {
RETURN_NOT_OK(builder.Append(PyString_AS_STRING(elem), PyString_GET_SIZE(elem)));
auto data = reinterpret_cast<uint8_t*>(PyString_AS_STRING(elem));
auto size = PyString_GET_SIZE(elem);
RETURN_NOT_OK(builder.AppendBytes(data, size));
} else if (PyUnicode_Check(elem)) {
Py_ssize_t size;
#if PY_MAJOR_VERSION >= 3
char* data = PyUnicode_AsUTF8AndSize(elem, &size); // TODO(pcm): Check if this is correct
#else
PyObject* str = PyUnicode_AsUTF8String(elem);
char* data = PyString_AS_STRING(str);
size = PyString_GET_SIZE(str);
#endif
Status s = builder.AppendString(data, size);
Py_XDECREF(str);
RETURN_NOT_OK(s);
} else if (PyList_Check(elem)) {
builder.AppendList(PyList_Size(elem));
sublists.push_back(elem);
@@ -78,7 +97,7 @@ Status append(PyObject* elem, SequenceBuilder& builder,
} else if (PyArray_Check(elem)) {
RETURN_NOT_OK(SerializeArray((PyArrayObject*) elem, builder));
} else if (elem == Py_None) {
RETURN_NOT_OK(builder.Append());
RETURN_NOT_OK(builder.AppendNone());
} else {
std::stringstream ss;
ss << "data type of " << PyString_AS_STRING(PyObject_Repr(elem))
+4 -4
View File
@@ -16,11 +16,11 @@ namespace numbuf {
arrow::Status AppendScalar(PyObject* obj, SequenceBuilder& builder) {
if (PyArray_IsScalar(obj, Bool)) {
return builder.Append(((PyBoolScalarObject *)obj)->obval != 0);
return builder.AppendBool(((PyBoolScalarObject *)obj)->obval != 0);
} else if (PyArray_IsScalar(obj, Float)) {
return builder.Append(((PyFloatScalarObject *)obj)->obval);
return builder.AppendFloat(((PyFloatScalarObject *)obj)->obval);
} else if (PyArray_IsScalar(obj, Double)) {
return builder.Append(((PyDoubleScalarObject *)obj)->obval);
return builder.AppendDouble(((PyDoubleScalarObject *)obj)->obval);
}
int64_t value = 0;
if (PyArray_IsScalar(obj, Byte)) {
@@ -46,7 +46,7 @@ arrow::Status AppendScalar(PyObject* obj, SequenceBuilder& builder) {
} else {
DCHECK(false) << "scalar type not recognized";
}
return builder.Append(value);
return builder.AppendInt64(value);
}
} // namespace